1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/objtool.h> 5 #include <linux/percpu.h> 6 7 #include <asm/debugreg.h> 8 #include <asm/mmu_context.h> 9 10 #include "cpuid.h" 11 #include "hyperv.h" 12 #include "mmu.h" 13 #include "nested.h" 14 #include "pmu.h" 15 #include "sgx.h" 16 #include "trace.h" 17 #include "vmx.h" 18 #include "x86.h" 19 #include "smm.h" 20 21 static bool __read_mostly enable_shadow_vmcs = 1; 22 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 23 24 static bool __read_mostly nested_early_check = 0; 25 module_param(nested_early_check, bool, S_IRUGO); 26 27 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 28 29 /* 30 * Hyper-V requires all of these, so mark them as supported even though 31 * they are just treated the same as all-context. 32 */ 33 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 34 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 35 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 36 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 37 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 38 39 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 40 41 enum { 42 VMX_VMREAD_BITMAP, 43 VMX_VMWRITE_BITMAP, 44 VMX_BITMAP_NR 45 }; 46 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 47 48 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 49 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 50 51 struct shadow_vmcs_field { 52 u16 encoding; 53 u16 offset; 54 }; 55 static struct shadow_vmcs_field shadow_read_only_fields[] = { 56 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 57 #include "vmcs_shadow_fields.h" 58 }; 59 static int max_shadow_read_only_fields = 60 ARRAY_SIZE(shadow_read_only_fields); 61 62 static struct shadow_vmcs_field shadow_read_write_fields[] = { 63 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 64 #include "vmcs_shadow_fields.h" 65 }; 66 static int max_shadow_read_write_fields = 67 ARRAY_SIZE(shadow_read_write_fields); 68 69 static void init_vmcs_shadow_fields(void) 70 { 71 int i, j; 72 73 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 74 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 75 76 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 77 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 78 u16 field = entry.encoding; 79 80 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 81 (i + 1 == max_shadow_read_only_fields || 82 shadow_read_only_fields[i + 1].encoding != field + 1)) 83 pr_err("Missing field from shadow_read_only_field %x\n", 84 field + 1); 85 86 clear_bit(field, vmx_vmread_bitmap); 87 if (field & 1) 88 #ifdef CONFIG_X86_64 89 continue; 90 #else 91 entry.offset += sizeof(u32); 92 #endif 93 shadow_read_only_fields[j++] = entry; 94 } 95 max_shadow_read_only_fields = j; 96 97 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 98 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 99 u16 field = entry.encoding; 100 101 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 102 (i + 1 == max_shadow_read_write_fields || 103 shadow_read_write_fields[i + 1].encoding != field + 1)) 104 pr_err("Missing field from shadow_read_write_field %x\n", 105 field + 1); 106 107 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 108 field <= GUEST_TR_AR_BYTES, 109 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 110 111 /* 112 * PML and the preemption timer can be emulated, but the 113 * processor cannot vmwrite to fields that don't exist 114 * on bare metal. 
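		 *
		 * For example, on a CPU without PML the GUEST_PML_INDEX
		 * encoding is skipped by the switch below, so its bit stays
		 * set in vmx_vmwrite_bitmap and L1's VMWRITEs to that field
		 * keep trapping to KVM instead of hitting the shadow VMCS.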
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
		& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
		    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force a sync to the shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't just reset the guest here.
*/ 206 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 207 pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); 208 } 209 210 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 211 { 212 return fixed_bits_valid(control, low, high); 213 } 214 215 static inline u64 vmx_control_msr(u32 low, u32 high) 216 { 217 return low | ((u64)high << 32); 218 } 219 220 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 221 { 222 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 223 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 224 vmx->nested.need_vmcs12_to_shadow_sync = false; 225 } 226 227 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) 228 { 229 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); 230 struct vcpu_vmx *vmx = to_vmx(vcpu); 231 232 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 233 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); 234 vmx->nested.hv_evmcs = NULL; 235 } 236 237 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 238 239 if (hv_vcpu) { 240 hv_vcpu->nested.pa_page_gpa = INVALID_GPA; 241 hv_vcpu->nested.vm_id = 0; 242 hv_vcpu->nested.vp_id = 0; 243 } 244 } 245 246 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 247 struct loaded_vmcs *prev) 248 { 249 struct vmcs_host_state *dest, *src; 250 251 if (unlikely(!vmx->guest_state_loaded)) 252 return; 253 254 src = &prev->host_state; 255 dest = &vmx->loaded_vmcs->host_state; 256 257 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 258 dest->ldt_sel = src->ldt_sel; 259 #ifdef CONFIG_X86_64 260 dest->ds_sel = src->ds_sel; 261 dest->es_sel = src->es_sel; 262 #endif 263 } 264 265 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 266 { 267 struct vcpu_vmx *vmx = to_vmx(vcpu); 268 struct loaded_vmcs *prev; 269 int cpu; 270 271 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) 272 return; 273 274 cpu = get_cpu(); 275 prev = vmx->loaded_vmcs; 276 vmx->loaded_vmcs = vmcs; 277 vmx_vcpu_load_vmcs(vcpu, cpu, prev); 278 vmx_sync_vmcs_host_state(vmx, prev); 279 put_cpu(); 280 281 vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; 282 283 /* 284 * All lazily updated registers will be reloaded from VMCS12 on both 285 * vmentry and vmexit. 286 */ 287 vcpu->arch.regs_dirty = 0; 288 } 289 290 /* 291 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 292 * just stops using VMX. 293 */ 294 static void free_nested(struct kvm_vcpu *vcpu) 295 { 296 struct vcpu_vmx *vmx = to_vmx(vcpu); 297 298 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) 299 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 300 301 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 302 return; 303 304 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 305 306 vmx->nested.vmxon = false; 307 vmx->nested.smm.vmxon = false; 308 vmx->nested.vmxon_ptr = INVALID_GPA; 309 free_vpid(vmx->nested.vpid02); 310 vmx->nested.posted_intr_nv = -1; 311 vmx->nested.current_vmptr = INVALID_GPA; 312 if (enable_shadow_vmcs) { 313 vmx_disable_shadow_vmcs(vmx); 314 vmcs_clear(vmx->vmcs01.shadow_vmcs); 315 free_vmcs(vmx->vmcs01.shadow_vmcs); 316 vmx->vmcs01.shadow_vmcs = NULL; 317 } 318 kfree(vmx->nested.cached_vmcs12); 319 vmx->nested.cached_vmcs12 = NULL; 320 kfree(vmx->nested.cached_shadow_vmcs12); 321 vmx->nested.cached_shadow_vmcs12 = NULL; 322 /* 323 * Unpin physical memory we referred to in the vmcs02. The APIC access 324 * page's backing page (yeah, confusing) shouldn't actually be accessed, 325 * and if it is written, the contents are irrelevant. 
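	 *
	 * That is also why the unmap below passes dirty=false for the
	 * APIC-access page, whereas the virtual-APIC page and the posted
	 * interrupt descriptor, which the CPU does write, are marked dirty
	 * when unmapped.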
326 */ 327 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 328 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 329 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 330 vmx->nested.pi_desc = NULL; 331 332 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 333 334 nested_release_evmcs(vcpu); 335 336 free_loaded_vmcs(&vmx->nested.vmcs02); 337 } 338 339 /* 340 * Ensure that the current vmcs of the logical processor is the 341 * vmcs01 of the vcpu before calling free_nested(). 342 */ 343 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 344 { 345 vcpu_load(vcpu); 346 vmx_leave_nested(vcpu); 347 vcpu_put(vcpu); 348 } 349 350 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 351 352 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 353 { 354 return VALID_PAGE(root_hpa) && 355 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 356 } 357 358 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 359 gpa_t addr) 360 { 361 unsigned long roots = 0; 362 uint i; 363 struct kvm_mmu_root_info *cached_root; 364 365 WARN_ON_ONCE(!mmu_is_nested(vcpu)); 366 367 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 368 cached_root = &vcpu->arch.mmu->prev_roots[i]; 369 370 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 371 eptp)) 372 roots |= KVM_MMU_ROOT_PREVIOUS(i); 373 } 374 if (roots) 375 kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); 376 } 377 378 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 379 struct x86_exception *fault) 380 { 381 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 382 struct vcpu_vmx *vmx = to_vmx(vcpu); 383 u32 vm_exit_reason; 384 unsigned long exit_qualification = vcpu->arch.exit_qualification; 385 386 if (vmx->nested.pml_full) { 387 vm_exit_reason = EXIT_REASON_PML_FULL; 388 vmx->nested.pml_full = false; 389 exit_qualification &= INTR_INFO_UNBLOCK_NMI; 390 } else { 391 if (fault->error_code & PFERR_RSVD_MASK) 392 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 393 else 394 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 395 396 /* 397 * Although the caller (kvm_inject_emulated_page_fault) would 398 * have already synced the faulting address in the shadow EPT 399 * tables for the current EPTP12, we also need to sync it for 400 * any other cached EPTP02s based on the same EP4TA, since the 401 * TLB associates mappings to the EP4TA rather than the full EPTP. 
402 */ 403 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, 404 fault->address); 405 } 406 407 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 408 vmcs12->guest_physical_address = fault->address; 409 } 410 411 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) 412 { 413 struct vcpu_vmx *vmx = to_vmx(vcpu); 414 bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; 415 int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); 416 417 kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, 418 nested_ept_ad_enabled(vcpu), 419 nested_ept_get_eptp(vcpu)); 420 } 421 422 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 423 { 424 WARN_ON(mmu_is_nested(vcpu)); 425 426 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 427 nested_ept_new_eptp(vcpu); 428 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; 429 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 430 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 431 432 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 433 } 434 435 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 436 { 437 vcpu->arch.mmu = &vcpu->arch.root_mmu; 438 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 439 } 440 441 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 442 u16 error_code) 443 { 444 bool inequality, bit; 445 446 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 447 inequality = 448 (error_code & vmcs12->page_fault_error_code_mask) != 449 vmcs12->page_fault_error_code_match; 450 return inequality ^ bit; 451 } 452 453 static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, 454 u32 error_code) 455 { 456 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 457 458 /* 459 * Drop bits 31:16 of the error code when performing the #PF mask+match 460 * check. All VMCS fields involved are 32 bits, but Intel CPUs never 461 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected 462 * error code. Including the to-be-dropped bits in the check might 463 * result in an "impossible" or missed exit from L1's perspective. 464 */ 465 if (vector == PF_VECTOR) 466 return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); 467 468 return (vmcs12->exception_bitmap & (1u << vector)); 469 } 470 471 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 472 struct vmcs12 *vmcs12) 473 { 474 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 475 return 0; 476 477 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 478 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 479 return -EINVAL; 480 481 return 0; 482 } 483 484 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 485 struct vmcs12 *vmcs12) 486 { 487 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 488 return 0; 489 490 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 491 return -EINVAL; 492 493 return 0; 494 } 495 496 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 497 struct vmcs12 *vmcs12) 498 { 499 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 500 return 0; 501 502 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 503 return -EINVAL; 504 505 return 0; 506 } 507 508 /* 509 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 510 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, 511 * only the "disable intercept" case needs to be handled. 
512 */ 513 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, 514 unsigned long *msr_bitmap_l0, 515 u32 msr, int type) 516 { 517 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) 518 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); 519 520 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) 521 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); 522 } 523 524 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 525 { 526 int msr; 527 528 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 529 unsigned word = msr / BITS_PER_LONG; 530 531 msr_bitmap[word] = ~0; 532 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 533 } 534 } 535 536 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ 537 static inline \ 538 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ 539 unsigned long *msr_bitmap_l1, \ 540 unsigned long *msr_bitmap_l0, u32 msr) \ 541 { \ 542 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ 543 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ 544 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 545 else \ 546 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 547 } 548 BUILD_NVMX_MSR_INTERCEPT_HELPER(read) 549 BUILD_NVMX_MSR_INTERCEPT_HELPER(write) 550 551 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, 552 unsigned long *msr_bitmap_l1, 553 unsigned long *msr_bitmap_l0, 554 u32 msr, int types) 555 { 556 if (types & MSR_TYPE_R) 557 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, 558 msr_bitmap_l0, msr); 559 if (types & MSR_TYPE_W) 560 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, 561 msr_bitmap_l0, msr); 562 } 563 564 /* 565 * Merge L0's and L1's MSR bitmap, return false to indicate that 566 * we do not use the hardware. 567 */ 568 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 569 struct vmcs12 *vmcs12) 570 { 571 struct vcpu_vmx *vmx = to_vmx(vcpu); 572 int msr; 573 unsigned long *msr_bitmap_l1; 574 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; 575 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 576 struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; 577 578 /* Nothing to do if the MSR bitmap is not in use. */ 579 if (!cpu_has_vmx_msr_bitmap() || 580 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 581 return false; 582 583 /* 584 * MSR bitmap update can be skipped when: 585 * - MSR bitmap for L1 hasn't changed. 586 * - Nested hypervisor (L1) is attempting to launch the same L2 as 587 * before. 588 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature 589 * and tells KVM (L0) there were no changes in MSR bitmap for L2. 590 */ 591 if (!vmx->nested.force_msr_bitmap_recalc && evmcs && 592 evmcs->hv_enlightenments_control.msr_bitmap && 593 evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) 594 return true; 595 596 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) 597 return false; 598 599 msr_bitmap_l1 = (unsigned long *)map->hva; 600 601 /* 602 * To keep the control flow simple, pay eight 8-byte writes (sixteen 603 * 4-byte writes on 32-bit systems) up front to enable intercepts for 604 * the x2APIC MSR range and selectively toggle those relevant to L2. 
605 */ 606 enable_x2apic_msr_intercepts(msr_bitmap_l0); 607 608 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 609 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 610 /* 611 * L0 need not intercept reads for MSRs between 0x800 612 * and 0x8ff, it just lets the processor take the value 613 * from the virtual-APIC page; take those 256 bits 614 * directly from the L1 bitmap. 615 */ 616 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 617 unsigned word = msr / BITS_PER_LONG; 618 619 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 620 } 621 } 622 623 nested_vmx_disable_intercept_for_x2apic_msr( 624 msr_bitmap_l1, msr_bitmap_l0, 625 X2APIC_MSR(APIC_TASKPRI), 626 MSR_TYPE_R | MSR_TYPE_W); 627 628 if (nested_cpu_has_vid(vmcs12)) { 629 nested_vmx_disable_intercept_for_x2apic_msr( 630 msr_bitmap_l1, msr_bitmap_l0, 631 X2APIC_MSR(APIC_EOI), 632 MSR_TYPE_W); 633 nested_vmx_disable_intercept_for_x2apic_msr( 634 msr_bitmap_l1, msr_bitmap_l0, 635 X2APIC_MSR(APIC_SELF_IPI), 636 MSR_TYPE_W); 637 } 638 } 639 640 /* 641 * Always check vmcs01's bitmap to honor userspace MSR filters and any 642 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 643 */ 644 #ifdef CONFIG_X86_64 645 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 646 MSR_FS_BASE, MSR_TYPE_RW); 647 648 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 649 MSR_GS_BASE, MSR_TYPE_RW); 650 651 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 652 MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 653 #endif 654 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 655 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 656 657 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 658 MSR_IA32_PRED_CMD, MSR_TYPE_W); 659 660 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 661 MSR_IA32_FLUSH_CMD, MSR_TYPE_W); 662 663 kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); 664 665 vmx->nested.force_msr_bitmap_recalc = false; 666 667 return true; 668 } 669 670 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 671 struct vmcs12 *vmcs12) 672 { 673 struct vcpu_vmx *vmx = to_vmx(vcpu); 674 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 675 676 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 677 vmcs12->vmcs_link_pointer == INVALID_GPA) 678 return; 679 680 if (ghc->gpa != vmcs12->vmcs_link_pointer && 681 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 682 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 683 return; 684 685 kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 686 VMCS12_SIZE); 687 } 688 689 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 690 struct vmcs12 *vmcs12) 691 { 692 struct vcpu_vmx *vmx = to_vmx(vcpu); 693 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 694 695 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 696 vmcs12->vmcs_link_pointer == INVALID_GPA) 697 return; 698 699 if (ghc->gpa != vmcs12->vmcs_link_pointer && 700 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 701 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 702 return; 703 704 kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 705 VMCS12_SIZE); 706 } 707 708 /* 709 * In nested virtualization, check if L1 has set 710 * VM_EXIT_ACK_INTR_ON_EXIT 711 */ 712 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 713 { 714 return get_vmcs12(vcpu)->vm_exit_controls & 715 VM_EXIT_ACK_INTR_ON_EXIT; 716 } 717 718 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 719 struct vmcs12 *vmcs12) 720 { 721 if 
(nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 722 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 723 return -EINVAL; 724 else 725 return 0; 726 } 727 728 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 729 struct vmcs12 *vmcs12) 730 { 731 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 732 !nested_cpu_has_apic_reg_virt(vmcs12) && 733 !nested_cpu_has_vid(vmcs12) && 734 !nested_cpu_has_posted_intr(vmcs12)) 735 return 0; 736 737 /* 738 * If virtualize x2apic mode is enabled, 739 * virtualize apic access must be disabled. 740 */ 741 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 742 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 743 return -EINVAL; 744 745 /* 746 * If virtual interrupt delivery is enabled, 747 * we must exit on external interrupts. 748 */ 749 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 750 return -EINVAL; 751 752 /* 753 * bits 15:8 should be zero in posted_intr_nv, 754 * the descriptor address has been already checked 755 * in nested_get_vmcs12_pages. 756 * 757 * bits 5:0 of posted_intr_desc_addr should be zero. 758 */ 759 if (nested_cpu_has_posted_intr(vmcs12) && 760 (CC(!nested_cpu_has_vid(vmcs12)) || 761 CC(!nested_exit_intr_ack_set(vcpu)) || 762 CC((vmcs12->posted_intr_nv & 0xff00)) || 763 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) 764 return -EINVAL; 765 766 /* tpr shadow is needed by all apicv features. */ 767 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 768 return -EINVAL; 769 770 return 0; 771 } 772 773 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 774 u32 count, u64 addr) 775 { 776 if (count == 0) 777 return 0; 778 779 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || 780 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) 781 return -EINVAL; 782 783 return 0; 784 } 785 786 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 787 struct vmcs12 *vmcs12) 788 { 789 if (CC(nested_vmx_check_msr_switch(vcpu, 790 vmcs12->vm_exit_msr_load_count, 791 vmcs12->vm_exit_msr_load_addr)) || 792 CC(nested_vmx_check_msr_switch(vcpu, 793 vmcs12->vm_exit_msr_store_count, 794 vmcs12->vm_exit_msr_store_addr))) 795 return -EINVAL; 796 797 return 0; 798 } 799 800 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 801 struct vmcs12 *vmcs12) 802 { 803 if (CC(nested_vmx_check_msr_switch(vcpu, 804 vmcs12->vm_entry_msr_load_count, 805 vmcs12->vm_entry_msr_load_addr))) 806 return -EINVAL; 807 808 return 0; 809 } 810 811 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 812 struct vmcs12 *vmcs12) 813 { 814 if (!nested_cpu_has_pml(vmcs12)) 815 return 0; 816 817 if (CC(!nested_cpu_has_ept(vmcs12)) || 818 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 819 return -EINVAL; 820 821 return 0; 822 } 823 824 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 825 struct vmcs12 *vmcs12) 826 { 827 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 828 !nested_cpu_has_ept(vmcs12))) 829 return -EINVAL; 830 return 0; 831 } 832 833 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 834 struct vmcs12 *vmcs12) 835 { 836 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 837 !nested_cpu_has_ept(vmcs12))) 838 return -EINVAL; 839 return 0; 840 } 841 842 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 843 struct vmcs12 *vmcs12) 844 { 845 if 
(!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load the guest's/host's MSRs at nested entry/exit.
 * Returns 0 on success, or the 1-based index of the failing entry on failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware as much
 * as possible, process all valid entries before failing rather than
 * prechecking for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
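	 *
	 * The autostore slot holds a host-side TSC value captured by the CPU
	 * at VM-exit; kvm_read_l1_tsc() below converts it to L1's timeline by
	 * applying L1's TSC scaling and offset.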
953 */ 954 if (msr_index == MSR_IA32_TSC) { 955 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, 956 MSR_IA32_TSC); 957 958 if (i >= 0) { 959 u64 val = vmx->msr_autostore.guest.val[i].value; 960 961 *data = kvm_read_l1_tsc(vcpu, val); 962 return true; 963 } 964 } 965 966 if (kvm_get_msr(vcpu, msr_index, data)) { 967 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 968 msr_index); 969 return false; 970 } 971 return true; 972 } 973 974 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 975 struct vmx_msr_entry *e) 976 { 977 if (kvm_vcpu_read_guest(vcpu, 978 gpa + i * sizeof(*e), 979 e, 2 * sizeof(u32))) { 980 pr_debug_ratelimited( 981 "%s cannot read MSR entry (%u, 0x%08llx)\n", 982 __func__, i, gpa + i * sizeof(*e)); 983 return false; 984 } 985 if (nested_vmx_store_msr_check(vcpu, e)) { 986 pr_debug_ratelimited( 987 "%s check failed (%u, 0x%x, 0x%x)\n", 988 __func__, i, e->index, e->reserved); 989 return false; 990 } 991 return true; 992 } 993 994 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 995 { 996 u64 data; 997 u32 i; 998 struct vmx_msr_entry e; 999 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1000 1001 for (i = 0; i < count; i++) { 1002 if (unlikely(i >= max_msr_list_size)) 1003 return -EINVAL; 1004 1005 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1006 return -EINVAL; 1007 1008 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 1009 return -EINVAL; 1010 1011 if (kvm_vcpu_write_guest(vcpu, 1012 gpa + i * sizeof(e) + 1013 offsetof(struct vmx_msr_entry, value), 1014 &data, sizeof(data))) { 1015 pr_debug_ratelimited( 1016 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1017 __func__, i, e.index, data); 1018 return -EINVAL; 1019 } 1020 } 1021 return 0; 1022 } 1023 1024 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1025 { 1026 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1027 u32 count = vmcs12->vm_exit_msr_store_count; 1028 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1029 struct vmx_msr_entry e; 1030 u32 i; 1031 1032 for (i = 0; i < count; i++) { 1033 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1034 return false; 1035 1036 if (e.index == msr_index) 1037 return true; 1038 } 1039 return false; 1040 } 1041 1042 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, 1043 u32 msr_index) 1044 { 1045 struct vcpu_vmx *vmx = to_vmx(vcpu); 1046 struct vmx_msrs *autostore = &vmx->msr_autostore.guest; 1047 bool in_vmcs12_store_list; 1048 int msr_autostore_slot; 1049 bool in_autostore_list; 1050 int last; 1051 1052 msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); 1053 in_autostore_list = msr_autostore_slot >= 0; 1054 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); 1055 1056 if (in_vmcs12_store_list && !in_autostore_list) { 1057 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { 1058 /* 1059 * Emulated VMEntry does not fail here. Instead a less 1060 * accurate value will be returned by 1061 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() 1062 * instead of reading the value from the vmcs02 VMExit 1063 * MSR-store area. 1064 */ 1065 pr_warn_ratelimited( 1066 "Not enough msr entries in msr_autostore. 
Can't add msr %x\n",
					    msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

/*
 * Load the guest's/host's CR3 at nested entry/exit. @nested_ept is true if we
 * are emulating VM-Entry into a guest with EPT enabled. On failure, the
 * expected Exit Qualification (for a VM-Entry consistency check VM-Exit) is
 * assigned to @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_PDPTE;
		return -EINVAL;
	}

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTPs because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTPs.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPIDs (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
	 * L2's VP_ID upon request from the guest. Make sure we check for
	 * pending entries in the right FIFO upon L1/L2 transition as these
	 * requests are put by other vCPUs asynchronously.
	 */
	if (to_hv_vcpu(vcpu) && enable_ept)
		kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);

	/*
	 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
	 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
	 * full TLB flush from the guest's perspective. This is required even
	 * if VPID is disabled in the host as KVM may need to synchronize the
	 * MMU in response to the guest TLB flush.
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled. As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
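	 *
	 * Summarizing the cases handled below: no vpid12 => flush guest
	 * mappings; vpid12 changed on VM-Enter => treat it as a new ASID and
	 * flush guest mappings; vpid12 unchanged but L2 shares L1's hardware
	 * TLB tag => flush the current (shared) context.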
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
	 * is the VPID incorporated into the MMU context. I.e. KVM must assume
	 * that the new vpid12 has never been used and thus represents a new
	 * guest ASID that cannot have entries in the TLB.
	 */
	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
	 * KVM was unable to allocate a VPID for L2, flush the current context
	 * as the effective ASID is common to both L1 and L2.
	 */
	if (!nested_has_guest_tlb_tag(vcpu))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmcs_config.nested.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
				u32 **low, u32 **high)
{
	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		*low = &msrs->pinbased_ctls_low;
		*high = &msrs->pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		*low = &msrs->procbased_ctls_low;
		*high = &msrs->procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		*low = &msrs->exit_ctls_low;
		*high = &msrs->exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		*low = &msrs->entry_ctls_low;
		*high = &msrs->entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*low = &msrs->secondary_ctls_low;
		*high = &msrs->secondary_ctls_high;
		break;
	default:
		BUG();
	}
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u32 *lowp, *highp;
	u64 supported;

	vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1.
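	 *
	 * By convention the low 32 bits of a VMX control MSR report the
	 * allowed 0-settings (a bit set there means the control must be 1)
	 * and the high 32 bits report the allowed 1-settings (a bit set there
	 * means the control may be 1). I.e. userspace may not clear a bit
	 * that KVM reports as must-be-1, and (below) may not set a bit that
	 * KVM does not allow to be 1.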
*/ 1274 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) 1275 return -EINVAL; 1276 1277 /* Check must-be-0 bits are still 0. */ 1278 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) 1279 return -EINVAL; 1280 1281 vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); 1282 *lowp = data; 1283 *highp = data >> 32; 1284 return 0; 1285 } 1286 1287 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1288 { 1289 const u64 feature_and_reserved_bits = 1290 /* feature */ 1291 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | 1292 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | 1293 /* reserved */ 1294 GENMASK_ULL(13, 9) | BIT_ULL(31); 1295 u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, 1296 vmcs_config.nested.misc_high); 1297 1298 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) 1299 return -EINVAL; 1300 1301 if ((vmx->nested.msrs.pinbased_ctls_high & 1302 PIN_BASED_VMX_PREEMPTION_TIMER) && 1303 vmx_misc_preemption_timer_rate(data) != 1304 vmx_misc_preemption_timer_rate(vmx_misc)) 1305 return -EINVAL; 1306 1307 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1308 return -EINVAL; 1309 1310 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1311 return -EINVAL; 1312 1313 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1314 return -EINVAL; 1315 1316 vmx->nested.msrs.misc_low = data; 1317 vmx->nested.msrs.misc_high = data >> 32; 1318 1319 return 0; 1320 } 1321 1322 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1323 { 1324 u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, 1325 vmcs_config.nested.vpid_caps); 1326 1327 /* Every bit is either reserved or a feature bit. */ 1328 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 1329 return -EINVAL; 1330 1331 vmx->nested.msrs.ept_caps = data; 1332 vmx->nested.msrs.vpid_caps = data >> 32; 1333 return 0; 1334 } 1335 1336 static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) 1337 { 1338 switch (msr_index) { 1339 case MSR_IA32_VMX_CR0_FIXED0: 1340 return &msrs->cr0_fixed0; 1341 case MSR_IA32_VMX_CR4_FIXED0: 1342 return &msrs->cr4_fixed0; 1343 default: 1344 BUG(); 1345 } 1346 } 1347 1348 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1349 { 1350 const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); 1351 1352 /* 1353 * 1 bits (which indicates bits which "must-be-1" during VMX operation) 1354 * must be 1 in the restored value. 1355 */ 1356 if (!is_bitwise_subset(data, *msr, -1ULL)) 1357 return -EINVAL; 1358 1359 *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; 1360 return 0; 1361 } 1362 1363 /* 1364 * Called when userspace is restoring VMX MSRs. 1365 * 1366 * Returns 0 on success, non-0 otherwise. 1367 */ 1368 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1369 { 1370 struct vcpu_vmx *vmx = to_vmx(vcpu); 1371 1372 /* 1373 * Don't allow changes to the VMX capability MSRs while the vCPU 1374 * is in VMX operation. 1375 */ 1376 if (vmx->nested.vmxon) 1377 return -EBUSY; 1378 1379 switch (msr_index) { 1380 case MSR_IA32_VMX_BASIC: 1381 return vmx_restore_vmx_basic(vmx, data); 1382 case MSR_IA32_VMX_PINBASED_CTLS: 1383 case MSR_IA32_VMX_PROCBASED_CTLS: 1384 case MSR_IA32_VMX_EXIT_CTLS: 1385 case MSR_IA32_VMX_ENTRY_CTLS: 1386 /* 1387 * The "non-true" VMX capability MSRs are generated from the 1388 * "true" MSRs, so we do not support restoring them directly. 
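		 *
		 * (The derivation can be seen in vmx_get_vmx_msr(): reads of
		 * the non-true MSRs return the "true" value with the
		 * always-on default1 bits, e.g.
		 * PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR, ORed back in.)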
1389 * 1390 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1391 * should restore the "true" MSRs with the must-be-1 bits 1392 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND 1393 * DEFAULT SETTINGS". 1394 */ 1395 return -EINVAL; 1396 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1397 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1398 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1399 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1400 case MSR_IA32_VMX_PROCBASED_CTLS2: 1401 return vmx_restore_control_msr(vmx, msr_index, data); 1402 case MSR_IA32_VMX_MISC: 1403 return vmx_restore_vmx_misc(vmx, data); 1404 case MSR_IA32_VMX_CR0_FIXED0: 1405 case MSR_IA32_VMX_CR4_FIXED0: 1406 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1407 case MSR_IA32_VMX_CR0_FIXED1: 1408 case MSR_IA32_VMX_CR4_FIXED1: 1409 /* 1410 * These MSRs are generated based on the vCPU's CPUID, so we 1411 * do not support restoring them directly. 1412 */ 1413 return -EINVAL; 1414 case MSR_IA32_VMX_EPT_VPID_CAP: 1415 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 1416 case MSR_IA32_VMX_VMCS_ENUM: 1417 vmx->nested.msrs.vmcs_enum = data; 1418 return 0; 1419 case MSR_IA32_VMX_VMFUNC: 1420 if (data & ~vmcs_config.nested.vmfunc_controls) 1421 return -EINVAL; 1422 vmx->nested.msrs.vmfunc_controls = data; 1423 return 0; 1424 default: 1425 /* 1426 * The rest of the VMX capability MSRs do not support restore. 1427 */ 1428 return -EINVAL; 1429 } 1430 } 1431 1432 /* Returns 0 on success, non-0 otherwise. */ 1433 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1434 { 1435 switch (msr_index) { 1436 case MSR_IA32_VMX_BASIC: 1437 *pdata = msrs->basic; 1438 break; 1439 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1440 case MSR_IA32_VMX_PINBASED_CTLS: 1441 *pdata = vmx_control_msr( 1442 msrs->pinbased_ctls_low, 1443 msrs->pinbased_ctls_high); 1444 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1445 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1446 break; 1447 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1448 case MSR_IA32_VMX_PROCBASED_CTLS: 1449 *pdata = vmx_control_msr( 1450 msrs->procbased_ctls_low, 1451 msrs->procbased_ctls_high); 1452 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1453 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1454 break; 1455 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1456 case MSR_IA32_VMX_EXIT_CTLS: 1457 *pdata = vmx_control_msr( 1458 msrs->exit_ctls_low, 1459 msrs->exit_ctls_high); 1460 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1461 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1462 break; 1463 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1464 case MSR_IA32_VMX_ENTRY_CTLS: 1465 *pdata = vmx_control_msr( 1466 msrs->entry_ctls_low, 1467 msrs->entry_ctls_high); 1468 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1469 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1470 break; 1471 case MSR_IA32_VMX_MISC: 1472 *pdata = vmx_control_msr( 1473 msrs->misc_low, 1474 msrs->misc_high); 1475 break; 1476 case MSR_IA32_VMX_CR0_FIXED0: 1477 *pdata = msrs->cr0_fixed0; 1478 break; 1479 case MSR_IA32_VMX_CR0_FIXED1: 1480 *pdata = msrs->cr0_fixed1; 1481 break; 1482 case MSR_IA32_VMX_CR4_FIXED0: 1483 *pdata = msrs->cr4_fixed0; 1484 break; 1485 case MSR_IA32_VMX_CR4_FIXED1: 1486 *pdata = msrs->cr4_fixed1; 1487 break; 1488 case MSR_IA32_VMX_VMCS_ENUM: 1489 *pdata = msrs->vmcs_enum; 1490 break; 1491 case MSR_IA32_VMX_PROCBASED_CTLS2: 1492 *pdata = vmx_control_msr( 1493 msrs->secondary_ctls_low, 1494 msrs->secondary_ctls_high); 1495 break; 1496 case MSR_IA32_VMX_EPT_VPID_CAP: 1497 *pdata = msrs->ept_caps | 1498 ((u64)msrs->vpid_caps << 32); 1499 break; 1500 
case MSR_IA32_VMX_VMFUNC: 1501 *pdata = msrs->vmfunc_controls; 1502 break; 1503 default: 1504 return 1; 1505 } 1506 1507 return 0; 1508 } 1509 1510 /* 1511 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have 1512 * been modified by the L1 guest. Note, "writable" in this context means 1513 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1514 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1515 * VM-exit information fields (which are actually writable if the vCPU is 1516 * configured to support "VMWRITE to any supported field in the VMCS"). 1517 */ 1518 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1519 { 1520 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1521 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1522 struct shadow_vmcs_field field; 1523 unsigned long val; 1524 int i; 1525 1526 if (WARN_ON(!shadow_vmcs)) 1527 return; 1528 1529 preempt_disable(); 1530 1531 vmcs_load(shadow_vmcs); 1532 1533 for (i = 0; i < max_shadow_read_write_fields; i++) { 1534 field = shadow_read_write_fields[i]; 1535 val = __vmcs_readl(field.encoding); 1536 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1537 } 1538 1539 vmcs_clear(shadow_vmcs); 1540 vmcs_load(vmx->loaded_vmcs->vmcs); 1541 1542 preempt_enable(); 1543 } 1544 1545 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1546 { 1547 const struct shadow_vmcs_field *fields[] = { 1548 shadow_read_write_fields, 1549 shadow_read_only_fields 1550 }; 1551 const int max_fields[] = { 1552 max_shadow_read_write_fields, 1553 max_shadow_read_only_fields 1554 }; 1555 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1556 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1557 struct shadow_vmcs_field field; 1558 unsigned long val; 1559 int i, q; 1560 1561 if (WARN_ON(!shadow_vmcs)) 1562 return; 1563 1564 vmcs_load(shadow_vmcs); 1565 1566 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1567 for (i = 0; i < max_fields[q]; i++) { 1568 field = fields[q][i]; 1569 val = vmcs12_read_any(vmcs12, field.encoding, 1570 field.offset); 1571 __vmcs_writel(field.encoding, val); 1572 } 1573 } 1574 1575 vmcs_clear(shadow_vmcs); 1576 vmcs_load(vmx->loaded_vmcs->vmcs); 1577 } 1578 1579 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) 1580 { 1581 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1582 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1583 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); 1584 1585 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1586 vmcs12->tpr_threshold = evmcs->tpr_threshold; 1587 vmcs12->guest_rip = evmcs->guest_rip; 1588 1589 if (unlikely(!(hv_clean_fields & 1590 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { 1591 hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; 1592 hv_vcpu->nested.vm_id = evmcs->hv_vm_id; 1593 hv_vcpu->nested.vp_id = evmcs->hv_vp_id; 1594 } 1595 1596 if (unlikely(!(hv_clean_fields & 1597 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1598 vmcs12->guest_rsp = evmcs->guest_rsp; 1599 vmcs12->guest_rflags = evmcs->guest_rflags; 1600 vmcs12->guest_interruptibility_info = 1601 evmcs->guest_interruptibility_info; 1602 /* 1603 * Not present in struct vmcs12: 1604 * vmcs12->guest_ssp = evmcs->guest_ssp; 1605 */ 1606 } 1607 1608 if (unlikely(!(hv_clean_fields & 1609 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1610 vmcs12->cpu_based_vm_exec_control = 1611 evmcs->cpu_based_vm_exec_control; 1612 } 1613 1614 if (unlikely(!(hv_clean_fields & 1615 
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1616 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1617 } 1618 1619 if (unlikely(!(hv_clean_fields & 1620 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1621 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1622 } 1623 1624 if (unlikely(!(hv_clean_fields & 1625 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1626 vmcs12->vm_entry_intr_info_field = 1627 evmcs->vm_entry_intr_info_field; 1628 vmcs12->vm_entry_exception_error_code = 1629 evmcs->vm_entry_exception_error_code; 1630 vmcs12->vm_entry_instruction_len = 1631 evmcs->vm_entry_instruction_len; 1632 } 1633 1634 if (unlikely(!(hv_clean_fields & 1635 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { 1636 vmcs12->host_ia32_pat = evmcs->host_ia32_pat; 1637 vmcs12->host_ia32_efer = evmcs->host_ia32_efer; 1638 vmcs12->host_cr0 = evmcs->host_cr0; 1639 vmcs12->host_cr3 = evmcs->host_cr3; 1640 vmcs12->host_cr4 = evmcs->host_cr4; 1641 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; 1642 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; 1643 vmcs12->host_rip = evmcs->host_rip; 1644 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; 1645 vmcs12->host_es_selector = evmcs->host_es_selector; 1646 vmcs12->host_cs_selector = evmcs->host_cs_selector; 1647 vmcs12->host_ss_selector = evmcs->host_ss_selector; 1648 vmcs12->host_ds_selector = evmcs->host_ds_selector; 1649 vmcs12->host_fs_selector = evmcs->host_fs_selector; 1650 vmcs12->host_gs_selector = evmcs->host_gs_selector; 1651 vmcs12->host_tr_selector = evmcs->host_tr_selector; 1652 vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; 1653 /* 1654 * Not present in struct vmcs12: 1655 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; 1656 * vmcs12->host_ssp = evmcs->host_ssp; 1657 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; 1658 */ 1659 } 1660 1661 if (unlikely(!(hv_clean_fields & 1662 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { 1663 vmcs12->pin_based_vm_exec_control = 1664 evmcs->pin_based_vm_exec_control; 1665 vmcs12->vm_exit_controls = evmcs->vm_exit_controls; 1666 vmcs12->secondary_vm_exec_control = 1667 evmcs->secondary_vm_exec_control; 1668 } 1669 1670 if (unlikely(!(hv_clean_fields & 1671 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { 1672 vmcs12->io_bitmap_a = evmcs->io_bitmap_a; 1673 vmcs12->io_bitmap_b = evmcs->io_bitmap_b; 1674 } 1675 1676 if (unlikely(!(hv_clean_fields & 1677 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { 1678 vmcs12->msr_bitmap = evmcs->msr_bitmap; 1679 } 1680 1681 if (unlikely(!(hv_clean_fields & 1682 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { 1683 vmcs12->guest_es_base = evmcs->guest_es_base; 1684 vmcs12->guest_cs_base = evmcs->guest_cs_base; 1685 vmcs12->guest_ss_base = evmcs->guest_ss_base; 1686 vmcs12->guest_ds_base = evmcs->guest_ds_base; 1687 vmcs12->guest_fs_base = evmcs->guest_fs_base; 1688 vmcs12->guest_gs_base = evmcs->guest_gs_base; 1689 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; 1690 vmcs12->guest_tr_base = evmcs->guest_tr_base; 1691 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; 1692 vmcs12->guest_idtr_base = evmcs->guest_idtr_base; 1693 vmcs12->guest_es_limit = evmcs->guest_es_limit; 1694 vmcs12->guest_cs_limit = evmcs->guest_cs_limit; 1695 vmcs12->guest_ss_limit = evmcs->guest_ss_limit; 1696 vmcs12->guest_ds_limit = evmcs->guest_ds_limit; 1697 vmcs12->guest_fs_limit = evmcs->guest_fs_limit; 1698 vmcs12->guest_gs_limit = evmcs->guest_gs_limit; 1699 vmcs12->guest_ldtr_limit = 
evmcs->guest_ldtr_limit; 1700 vmcs12->guest_tr_limit = evmcs->guest_tr_limit; 1701 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; 1702 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; 1703 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; 1704 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1705 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1706 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1707 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1708 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1709 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1710 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1711 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1712 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1713 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1714 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1715 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1716 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1717 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1718 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1719 } 1720 1721 if (unlikely(!(hv_clean_fields & 1722 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1723 vmcs12->tsc_offset = evmcs->tsc_offset; 1724 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1725 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1726 vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; 1727 vmcs12->tsc_multiplier = evmcs->tsc_multiplier; 1728 } 1729 1730 if (unlikely(!(hv_clean_fields & 1731 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1732 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1733 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1734 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1735 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1736 vmcs12->guest_cr0 = evmcs->guest_cr0; 1737 vmcs12->guest_cr3 = evmcs->guest_cr3; 1738 vmcs12->guest_cr4 = evmcs->guest_cr4; 1739 vmcs12->guest_dr7 = evmcs->guest_dr7; 1740 } 1741 1742 if (unlikely(!(hv_clean_fields & 1743 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1744 vmcs12->host_fs_base = evmcs->host_fs_base; 1745 vmcs12->host_gs_base = evmcs->host_gs_base; 1746 vmcs12->host_tr_base = evmcs->host_tr_base; 1747 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1748 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1749 vmcs12->host_rsp = evmcs->host_rsp; 1750 } 1751 1752 if (unlikely(!(hv_clean_fields & 1753 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1754 vmcs12->ept_pointer = evmcs->ept_pointer; 1755 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1756 } 1757 1758 if (unlikely(!(hv_clean_fields & 1759 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1760 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1761 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1762 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1763 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1764 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1765 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1766 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1767 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1768 vmcs12->guest_pending_dbg_exceptions = 1769 evmcs->guest_pending_dbg_exceptions; 1770 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1771 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1772 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1773 vmcs12->guest_activity_state = evmcs->guest_activity_state; 1774 vmcs12->guest_sysenter_cs = 
evmcs->guest_sysenter_cs; 1775 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; 1776 /* 1777 * Not present in struct vmcs12: 1778 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; 1779 * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1780 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1781 */ 1782 } 1783 1784 /* 1785 * Not used? 1786 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1787 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1788 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1789 * vmcs12->page_fault_error_code_mask = 1790 * evmcs->page_fault_error_code_mask; 1791 * vmcs12->page_fault_error_code_match = 1792 * evmcs->page_fault_error_code_match; 1793 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1794 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1795 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1796 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1797 */ 1798 1799 /* 1800 * Read only fields: 1801 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1802 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1803 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1804 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1805 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1806 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1807 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1808 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1809 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1810 * vmcs12->exit_qualification = evmcs->exit_qualification; 1811 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1812 * 1813 * Not present in struct vmcs12: 1814 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1815 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1816 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1817 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1818 */ 1819 1820 return; 1821 } 1822 1823 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1824 { 1825 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1826 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1827 1828 /* 1829 * Should not be changed by KVM: 1830 * 1831 * evmcs->host_es_selector = vmcs12->host_es_selector; 1832 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1833 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1834 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1835 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1836 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1837 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1838 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1839 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1840 * evmcs->host_cr0 = vmcs12->host_cr0; 1841 * evmcs->host_cr3 = vmcs12->host_cr3; 1842 * evmcs->host_cr4 = vmcs12->host_cr4; 1843 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1844 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1845 * evmcs->host_rip = vmcs12->host_rip; 1846 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1847 * evmcs->host_fs_base = vmcs12->host_fs_base; 1848 * evmcs->host_gs_base = vmcs12->host_gs_base; 1849 * evmcs->host_tr_base = vmcs12->host_tr_base; 1850 * 
evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1851 * evmcs->host_idtr_base = vmcs12->host_idtr_base; 1852 * evmcs->host_rsp = vmcs12->host_rsp; 1853 * sync_vmcs02_to_vmcs12() doesn't read these: 1854 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1855 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1856 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1857 * evmcs->ept_pointer = vmcs12->ept_pointer; 1858 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1859 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1860 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1861 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1862 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1863 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1864 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1865 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1866 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1867 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1868 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1869 * evmcs->page_fault_error_code_mask = 1870 * vmcs12->page_fault_error_code_mask; 1871 * evmcs->page_fault_error_code_match = 1872 * vmcs12->page_fault_error_code_match; 1873 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1874 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1875 * evmcs->tsc_offset = vmcs12->tsc_offset; 1876 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1877 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1878 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1879 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1880 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1881 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1882 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1883 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1884 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; 1885 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; 1886 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; 1887 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; 1888 * 1889 * Not present in struct vmcs12: 1890 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1891 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1892 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1893 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1894 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; 1895 * evmcs->host_ssp = vmcs12->host_ssp; 1896 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; 1897 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; 1898 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; 1899 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; 1900 * evmcs->guest_ssp = vmcs12->guest_ssp; 1901 */ 1902 1903 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1904 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1905 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1906 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1907 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1908 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1909 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1910 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1911 1912 
evmcs->guest_es_limit = vmcs12->guest_es_limit; 1913 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1914 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1915 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1916 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1917 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1918 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1919 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1920 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1921 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1922 1923 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1924 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1925 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1926 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1927 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1928 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1929 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1930 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1931 1932 evmcs->guest_es_base = vmcs12->guest_es_base; 1933 evmcs->guest_cs_base = vmcs12->guest_cs_base; 1934 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1935 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1936 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1937 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1938 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1939 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1940 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1941 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1942 1943 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1944 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1945 1946 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1947 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1948 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1949 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1950 1951 evmcs->guest_pending_dbg_exceptions = 1952 vmcs12->guest_pending_dbg_exceptions; 1953 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1954 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1955 1956 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1957 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1958 1959 evmcs->guest_cr0 = vmcs12->guest_cr0; 1960 evmcs->guest_cr3 = vmcs12->guest_cr3; 1961 evmcs->guest_cr4 = vmcs12->guest_cr4; 1962 evmcs->guest_dr7 = vmcs12->guest_dr7; 1963 1964 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1965 1966 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1967 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1968 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1969 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1970 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1971 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1972 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1973 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1974 1975 evmcs->exit_qualification = vmcs12->exit_qualification; 1976 1977 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1978 evmcs->guest_rsp = vmcs12->guest_rsp; 1979 evmcs->guest_rflags = vmcs12->guest_rflags; 1980 1981 evmcs->guest_interruptibility_info = 1982 vmcs12->guest_interruptibility_info; 1983 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1984 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1985 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1986 evmcs->vm_entry_exception_error_code = 
1987 vmcs12->vm_entry_exception_error_code;
1988 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1989
1990 evmcs->guest_rip = vmcs12->guest_rip;
1991
1992 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1993
1994 return;
1995 }
1996
1997 /*
1998 * This is the equivalent of the nested hypervisor executing the
1999 * vmptrld instruction.
2000 */
2001 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
2002 struct kvm_vcpu *vcpu, bool from_launch)
2003 {
2004 struct vcpu_vmx *vmx = to_vmx(vcpu);
2005 bool evmcs_gpa_changed = false;
2006 u64 evmcs_gpa;
2007
2008 if (likely(!guest_cpuid_has_evmcs(vcpu)))
2009 return EVMPTRLD_DISABLED;
2010
2011 evmcs_gpa = nested_get_evmptr(vcpu);
2012 if (!evmptr_is_valid(evmcs_gpa)) {
2013 nested_release_evmcs(vcpu);
2014 return EVMPTRLD_DISABLED;
2015 }
2016
2017 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
2018 vmx->nested.current_vmptr = INVALID_GPA;
2019
2020 nested_release_evmcs(vcpu);
2021
2022 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
2023 &vmx->nested.hv_evmcs_map))
2024 return EVMPTRLD_ERROR;
2025
2026 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
2027
2028 /*
2029 * Currently, KVM only supports eVMCS version 1
2030 * (== KVM_EVMCS_VERSION) and thus expects the guest to set this
2031 * value in the first u32 field of the eVMCS, which specifies
2032 * the eVMCS VersionNumber.
2033 *
2034 * The guest should learn the eVMCS versions supported by the
2035 * host by examining CPUID.0x4000000A.EAX[0:15]. The host
2036 * userspace VMM is expected to set this CPUID leaf according to
2037 * the value returned in vmcs_version from nested_enable_evmcs().
2038 *
2039 * However, it turns out that Microsoft Hyper-V fails to comply
2040 * with its own invented interface: when Hyper-V uses eVMCS, it
2041 * simply sets the first u32 field of the eVMCS to the
2042 * revision_id specified in MSR_IA32_VMX_BASIC, instead of the
2043 * eVMCS version number, which is one of the supported versions
2044 * specified in CPUID.0x4000000A.EAX[0:15].
2045 *
2046 * To work around this Hyper-V bug, accept either a supported
2047 * eVMCS version or the VMCS12 revision_id as valid values for
2048 * the first u32 field of the eVMCS.
2049 */
2050 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2051 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2052 nested_release_evmcs(vcpu);
2053 return EVMPTRLD_VMFAIL;
2054 }
2055
2056 vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
2057
2058 evmcs_gpa_changed = true;
2059 /*
2060 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
2061 * reloaded from the guest's memory (read-only fields, fields not
2062 * present in struct hv_enlightened_vmcs, ...). Make sure there
2063 * are no leftovers.
2064 */
2065 if (from_launch) {
2066 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2067 memset(vmcs12, 0, sizeof(*vmcs12));
2068 vmcs12->hdr.revision_id = VMCS12_REVISION;
2069 }
2070
2071 }
2072
2073 /*
2074 * Clean fields data can't be used on VMLAUNCH or when switching
2075 * between different L2 guests, as KVM keeps a single VMCS12 per L1.
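* Resetting the clean bits below forces the next copy_enlightened_to_vmcs12() to reload every field group from the eVMCS.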
2076 */ 2077 if (from_launch || evmcs_gpa_changed) { 2078 vmx->nested.hv_evmcs->hv_clean_fields &= 2079 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2080 2081 vmx->nested.force_msr_bitmap_recalc = true; 2082 } 2083 2084 return EVMPTRLD_SUCCEEDED; 2085 } 2086 2087 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2088 { 2089 struct vcpu_vmx *vmx = to_vmx(vcpu); 2090 2091 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2092 copy_vmcs12_to_enlightened(vmx); 2093 else 2094 copy_vmcs12_to_shadow(vmx); 2095 2096 vmx->nested.need_vmcs12_to_shadow_sync = false; 2097 } 2098 2099 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2100 { 2101 struct vcpu_vmx *vmx = 2102 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2103 2104 vmx->nested.preemption_timer_expired = true; 2105 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2106 kvm_vcpu_kick(&vmx->vcpu); 2107 2108 return HRTIMER_NORESTART; 2109 } 2110 2111 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2112 { 2113 struct vcpu_vmx *vmx = to_vmx(vcpu); 2114 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2115 2116 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2117 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2118 2119 if (!vmx->nested.has_preemption_timer_deadline) { 2120 vmx->nested.preemption_timer_deadline = 2121 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2122 vmx->nested.has_preemption_timer_deadline = true; 2123 } 2124 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2125 } 2126 2127 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2128 u64 preemption_timeout) 2129 { 2130 struct vcpu_vmx *vmx = to_vmx(vcpu); 2131 2132 /* 2133 * A timer value of zero is architecturally guaranteed to cause 2134 * a VMExit prior to executing any instructions in the guest. 2135 */ 2136 if (preemption_timeout == 0) { 2137 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2138 return; 2139 } 2140 2141 if (vcpu->arch.virtual_tsc_khz == 0) 2142 return; 2143 2144 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2145 preemption_timeout *= 1000000; 2146 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2147 hrtimer_start(&vmx->nested.preemption_timer, 2148 ktime_add_ns(ktime_get(), preemption_timeout), 2149 HRTIMER_MODE_ABS_PINNED); 2150 } 2151 2152 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2153 { 2154 if (vmx->nested.nested_run_pending && 2155 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2156 return vmcs12->guest_ia32_efer; 2157 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2158 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2159 else 2160 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2161 } 2162 2163 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2164 { 2165 struct kvm *kvm = vmx->vcpu.kvm; 2166 2167 /* 2168 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2169 * according to L0's settings (vmcs12 is irrelevant here). Host 2170 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2171 * will be set as needed prior to VMLAUNCH/VMRESUME. 2172 */ 2173 if (vmx->nested.vmcs02_initialized) 2174 return; 2175 vmx->nested.vmcs02_initialized = true; 2176 2177 /* 2178 * We don't care what the EPTP value is we just need to guarantee 2179 * it's valid so we don't get a false positive when doing early 2180 * consistency checks. 
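* A zero EPT root with a 4-level walk (see the construct_eptp() call below) is sufficient for that purpose.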
2181 */ 2182 if (enable_ept && nested_early_check) 2183 vmcs_write64(EPT_POINTER, 2184 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2185 2186 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2187 if (cpu_has_vmx_vmfunc()) 2188 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2189 2190 if (cpu_has_vmx_posted_intr()) 2191 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2192 2193 if (cpu_has_vmx_msr_bitmap()) 2194 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2195 2196 /* 2197 * PML is emulated for L2, but never enabled in hardware as the MMU 2198 * handles A/D emulation. Disabling PML for L2 also avoids having to 2199 * deal with filtering out L2 GPAs from the buffer. 2200 */ 2201 if (enable_pml) { 2202 vmcs_write64(PML_ADDRESS, 0); 2203 vmcs_write16(GUEST_PML_INDEX, -1); 2204 } 2205 2206 if (cpu_has_vmx_encls_vmexit()) 2207 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2208 2209 if (kvm_notify_vmexit_enabled(kvm)) 2210 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 2211 2212 /* 2213 * Set the MSR load/store lists to match L0's settings. Only the 2214 * addresses are constant (for vmcs02), the counts can change based 2215 * on L2's behavior, e.g. switching to/from long mode. 2216 */ 2217 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2218 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2219 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2220 2221 vmx_set_constant_host_state(vmx); 2222 } 2223 2224 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2225 struct vmcs12 *vmcs12) 2226 { 2227 prepare_vmcs02_constant_state(vmx); 2228 2229 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2230 2231 if (enable_vpid) { 2232 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2233 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2234 else 2235 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2236 } 2237 } 2238 2239 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2240 struct vmcs12 *vmcs12) 2241 { 2242 u32 exec_control; 2243 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2244 2245 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2246 prepare_vmcs02_early_rare(vmx, vmcs12); 2247 2248 /* 2249 * PIN CONTROLS 2250 */ 2251 exec_control = __pin_controls_get(vmcs01); 2252 exec_control |= (vmcs12->pin_based_vm_exec_control & 2253 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2254 2255 /* Posted interrupts setting is only taken from vmcs12. */ 2256 vmx->nested.pi_pending = false; 2257 if (nested_cpu_has_posted_intr(vmcs12)) 2258 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2259 else 2260 exec_control &= ~PIN_BASED_POSTED_INTR; 2261 pin_controls_set(vmx, exec_control); 2262 2263 /* 2264 * EXEC CONTROLS 2265 */ 2266 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2267 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2268 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2269 exec_control &= ~CPU_BASED_TPR_SHADOW; 2270 exec_control |= vmcs12->cpu_based_vm_exec_control; 2271 2272 vmx->nested.l1_tpr_threshold = -1; 2273 if (exec_control & CPU_BASED_TPR_SHADOW) 2274 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2275 #ifdef CONFIG_X86_64 2276 else 2277 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2278 CPU_BASED_CR8_STORE_EXITING; 2279 #endif 2280 2281 /* 2282 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2283 * for I/O port accesses. 
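* KVM never merges L1's I/O bitmaps into vmcs02, so unconditional I/O exiting is forced and the bitmap control is cleared below.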
2284 */ 2285 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2286 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2287 2288 /* 2289 * This bit will be computed in nested_get_vmcs12_pages, because 2290 * we do not have access to L1's MSR bitmap yet. For now, keep 2291 * the same bit as before, hoping to avoid multiple VMWRITEs that 2292 * only set/clear this bit. 2293 */ 2294 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2295 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2296 2297 exec_controls_set(vmx, exec_control); 2298 2299 /* 2300 * SECONDARY EXEC CONTROLS 2301 */ 2302 if (cpu_has_secondary_exec_ctrls()) { 2303 exec_control = __secondary_exec_controls_get(vmcs01); 2304 2305 /* Take the following fields only from vmcs12 */ 2306 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2307 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2308 SECONDARY_EXEC_ENABLE_INVPCID | 2309 SECONDARY_EXEC_ENABLE_RDTSCP | 2310 SECONDARY_EXEC_ENABLE_XSAVES | 2311 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2312 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2313 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2314 SECONDARY_EXEC_ENABLE_VMFUNC | 2315 SECONDARY_EXEC_DESC); 2316 2317 if (nested_cpu_has(vmcs12, 2318 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2319 exec_control |= vmcs12->secondary_vm_exec_control; 2320 2321 /* PML is emulated and never enabled in hardware for L2. */ 2322 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2323 2324 /* VMCS shadowing for L2 is emulated for now */ 2325 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2326 2327 /* 2328 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2329 * will not have to rewrite the controls just for this bit. 2330 */ 2331 if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2332 exec_control |= SECONDARY_EXEC_DESC; 2333 2334 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2335 vmcs_write16(GUEST_INTR_STATUS, 2336 vmcs12->guest_intr_status); 2337 2338 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2339 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2340 2341 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2342 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2343 2344 secondary_exec_controls_set(vmx, exec_control); 2345 } 2346 2347 /* 2348 * ENTRY CONTROLS 2349 * 2350 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2351 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2352 * on the related bits (if supported by the CPU) in the hope that 2353 * we can avoid VMWrites during vmx_set_efer(). 2354 * 2355 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is 2356 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to 2357 * do the same for L2. 2358 */ 2359 exec_control = __vm_entry_controls_get(vmcs01); 2360 exec_control |= (vmcs12->vm_entry_controls & 2361 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 2362 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2363 if (cpu_has_load_ia32_efer()) { 2364 if (guest_efer & EFER_LMA) 2365 exec_control |= VM_ENTRY_IA32E_MODE; 2366 if (guest_efer != host_efer) 2367 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2368 } 2369 vm_entry_controls_set(vmx, exec_control); 2370 2371 /* 2372 * EXIT CONTROLS 2373 * 2374 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2375 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2376 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
2377 */ 2378 exec_control = __vm_exit_controls_get(vmcs01); 2379 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2380 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2381 else 2382 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2383 vm_exit_controls_set(vmx, exec_control); 2384 2385 /* 2386 * Interrupt/Exception Fields 2387 */ 2388 if (vmx->nested.nested_run_pending) { 2389 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2390 vmcs12->vm_entry_intr_info_field); 2391 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2392 vmcs12->vm_entry_exception_error_code); 2393 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2394 vmcs12->vm_entry_instruction_len); 2395 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2396 vmcs12->guest_interruptibility_info); 2397 vmx->loaded_vmcs->nmi_known_unmasked = 2398 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2399 } else { 2400 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2401 } 2402 } 2403 2404 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2405 { 2406 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2407 2408 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2409 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2410 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2411 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2412 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2413 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2414 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2415 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2416 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2417 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2418 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2419 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2420 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2421 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2422 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2423 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2424 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2425 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2426 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2427 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2428 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2429 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2430 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2431 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2432 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2433 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2434 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2435 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2436 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2437 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2438 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2439 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2440 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2441 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2442 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2443 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2444 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2445 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2446 2447 vmx->segment_cache.bitmask = 0; 2448 } 2449 2450 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2451 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 
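/* SYSENTER state, pending debug exceptions, PDPTRs and BNDCFGS all belong to the GUEST_GRP1 clean-field group, so the VMWRITEs below are skipped when L1 has marked that group clean. */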
2452 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2453 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2454 vmcs12->guest_pending_dbg_exceptions);
2455 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2456 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2457
2458 /*
2459 * L1 may access L2's PDPTRs, so save them to construct
2460 * vmcs12.
2461 */
2462 if (enable_ept) {
2463 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2464 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2465 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2466 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2467 }
2468
2469 if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2470 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2471 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2472 }
2473
2474 if (nested_cpu_has_xsaves(vmcs12))
2475 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2476
2477 /*
2478 * Whether page-faults are trapped is determined by a combination of
2479 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
2480 * doesn't care about page faults then we should set all of these to
2481 * L1's desires. However, if L0 does care about (some) page faults,
2482 * it is not easy (if at all possible?) to merge L0's and L1's desires,
2483 * so we simply ask to exit on each and every L2 page fault. This is done by
2484 * setting MASK=MATCH=0 and (see below) EB.PF=1.
2485 * Note that below we don't need special code to set EB.PF beyond the
2486 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2487 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2488 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2489 */
2490 if (vmx_need_pf_intercept(&vmx->vcpu)) {
2491 /*
2492 * TODO: if both L0 and L1 need the same MASK and MATCH,
2493 * go ahead and use it?
2494 */
2495 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
2496 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
2497 } else {
2498 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
2499 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
2500 }
2501
2502 if (cpu_has_vmx_apicv()) {
2503 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2504 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2505 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2506 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2507 }
2508
2509 /*
2510 * Make sure the msr_autostore list is up to date before we set the
2511 * count in the vmcs02.
2512 */
2513 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2514
2515 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
2516 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2517 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2518
2519 set_cr4_guest_host_mask(vmx);
2520 }
2521
2522 /*
2523 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2524 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2525 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2526 * guest in a way that is appropriate both to L1's requests and to our
2527 * needs. In addition to modifying the active vmcs (which is vmcs02), this
2528 * function also has necessary side effects, like setting various
2529 * vcpu->arch fields.
2530 * Returns 0 on success, -EINVAL on failure.
Invalid state exit qualification code 2531 * is assigned to entry_failure_code on failure. 2532 */ 2533 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2534 bool from_vmentry, 2535 enum vm_entry_failure_code *entry_failure_code) 2536 { 2537 struct vcpu_vmx *vmx = to_vmx(vcpu); 2538 bool load_guest_pdptrs_vmcs12 = false; 2539 2540 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 2541 prepare_vmcs02_rare(vmx, vmcs12); 2542 vmx->nested.dirty_vmcs12 = false; 2543 2544 load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) || 2545 !(vmx->nested.hv_evmcs->hv_clean_fields & 2546 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2547 } 2548 2549 if (vmx->nested.nested_run_pending && 2550 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2551 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2552 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2553 } else { 2554 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2555 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); 2556 } 2557 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2558 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2559 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); 2560 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2561 2562 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2563 * bitwise-or of what L1 wants to trap for L2, and what we want to 2564 * trap. Note that CR0.TS also needs updating - we do this later. 2565 */ 2566 vmx_update_exception_bitmap(vcpu); 2567 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2568 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2569 2570 if (vmx->nested.nested_run_pending && 2571 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2572 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2573 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2574 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2575 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2576 } 2577 2578 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 2579 vcpu->arch.l1_tsc_offset, 2580 vmx_get_l2_tsc_offset(vcpu), 2581 vmx_get_l2_tsc_multiplier(vcpu)); 2582 2583 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( 2584 vcpu->arch.l1_tsc_scaling_ratio, 2585 vmx_get_l2_tsc_multiplier(vcpu)); 2586 2587 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2588 if (kvm_caps.has_tsc_control) 2589 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2590 2591 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2592 2593 if (nested_cpu_has_ept(vmcs12)) 2594 nested_ept_init_mmu_context(vcpu); 2595 2596 /* 2597 * Override the CR0/CR4 read shadows after setting the effective guest 2598 * CR0/CR4. The common helpers also set the shadows, but they don't 2599 * account for vmcs12's cr0/4_guest_host_mask. 2600 */ 2601 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2602 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2603 2604 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2605 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2606 2607 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2608 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2609 vmx_set_efer(vcpu, vcpu->arch.efer); 2610 2611 /* 2612 * Guest state is invalid and unrestricted guest is disabled, 2613 * which means L1 attempted VMEntry to L2 with invalid state. 2614 * Fail the VMEntry. 
2615 *
2616 * However, when force loading the guest state (SMM exit or
2617 * loading nested state after migration), it is possible to
2618 * have invalid guest state now, which will be fixed later by
2619 * restoring L2 register state.
2620 */
2621 if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
2622 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2623 return -EINVAL;
2624 }
2625
2626 /* Load guest CR3, using either EPT or shadow page tables. */
2627 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2628 from_vmentry, entry_failure_code))
2629 return -EINVAL;
2630
2631 /*
2632 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
2633 * on nested VM-Exit, which can occur without actually running L2 and
2634 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
2635 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2636 * transition to HLT instead of running L2.
2637 */
2638 if (enable_ept)
2639 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2640
2641 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2642 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2643 is_pae_paging(vcpu)) {
2644 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2645 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2646 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2647 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2648 }
2649
2650 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2651 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
2652 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2653 vmcs12->guest_ia32_perf_global_ctrl))) {
2654 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2655 return -EINVAL;
2656 }
2657
2658 kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2659 kvm_rip_write(vcpu, vmcs12->guest_rip);
2660
2661 /*
2662 * It was observed that genuine Hyper-V running in L1 doesn't reset
2663 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
2664 * bits when it changes a field in eVMCS. Mark all fields as clean
2665 * here.
2666 */
2667 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
2668 vmx->nested.hv_evmcs->hv_clean_fields |=
2669 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2670
2671 return 0;
2672 }
2673
2674 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2675 {
2676 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2677 nested_cpu_has_virtual_nmis(vmcs12)))
2678 return -EINVAL;
2679
2680 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
2681 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
2682 return -EINVAL;
2683
2684 return 0;
2685 }
2686
2687 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
2688 {
2689 struct vcpu_vmx *vmx = to_vmx(vcpu);
2690
2691 /* Check for memory type validity */
2692 switch (new_eptp & VMX_EPTP_MT_MASK) {
2693 case VMX_EPTP_MT_UC:
2694 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
2695 return false;
2696 break;
2697 case VMX_EPTP_MT_WB:
2698 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
2699 return false;
2700 break;
2701 default:
2702 return false;
2703 }
2704
2705 /* Page-walk levels validity.
*/ 2706 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2707 case VMX_EPTP_PWL_5: 2708 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2709 return false; 2710 break; 2711 case VMX_EPTP_PWL_4: 2712 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2713 return false; 2714 break; 2715 default: 2716 return false; 2717 } 2718 2719 /* Reserved bits should not be set */ 2720 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2721 return false; 2722 2723 /* AD, if set, should be supported */ 2724 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2725 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2726 return false; 2727 } 2728 2729 return true; 2730 } 2731 2732 /* 2733 * Checks related to VM-Execution Control Fields 2734 */ 2735 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2736 struct vmcs12 *vmcs12) 2737 { 2738 struct vcpu_vmx *vmx = to_vmx(vcpu); 2739 2740 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2741 vmx->nested.msrs.pinbased_ctls_low, 2742 vmx->nested.msrs.pinbased_ctls_high)) || 2743 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2744 vmx->nested.msrs.procbased_ctls_low, 2745 vmx->nested.msrs.procbased_ctls_high))) 2746 return -EINVAL; 2747 2748 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2749 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2750 vmx->nested.msrs.secondary_ctls_low, 2751 vmx->nested.msrs.secondary_ctls_high))) 2752 return -EINVAL; 2753 2754 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2755 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2756 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2757 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2758 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2759 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2760 nested_vmx_check_nmi_controls(vmcs12) || 2761 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2762 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2763 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2764 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2765 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2766 return -EINVAL; 2767 2768 if (!nested_cpu_has_preemption_timer(vmcs12) && 2769 nested_cpu_has_save_preemption_timer(vmcs12)) 2770 return -EINVAL; 2771 2772 if (nested_cpu_has_ept(vmcs12) && 2773 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2774 return -EINVAL; 2775 2776 if (nested_cpu_has_vmfunc(vmcs12)) { 2777 if (CC(vmcs12->vm_function_control & 2778 ~vmx->nested.msrs.vmfunc_controls)) 2779 return -EINVAL; 2780 2781 if (nested_cpu_has_eptp_switching(vmcs12)) { 2782 if (CC(!nested_cpu_has_ept(vmcs12)) || 2783 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2784 return -EINVAL; 2785 } 2786 } 2787 2788 return 0; 2789 } 2790 2791 /* 2792 * Checks related to VM-Exit Control Fields 2793 */ 2794 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2795 struct vmcs12 *vmcs12) 2796 { 2797 struct vcpu_vmx *vmx = to_vmx(vcpu); 2798 2799 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2800 vmx->nested.msrs.exit_ctls_low, 2801 vmx->nested.msrs.exit_ctls_high)) || 2802 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2803 return -EINVAL; 2804 2805 return 0; 2806 } 2807 2808 /* 2809 * Checks related to VM-Entry Control Fields 2810 */ 2811 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2812 struct vmcs12 *vmcs12) 2813 { 2814 struct 
vcpu_vmx *vmx = to_vmx(vcpu); 2815 2816 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2817 vmx->nested.msrs.entry_ctls_low, 2818 vmx->nested.msrs.entry_ctls_high))) 2819 return -EINVAL; 2820 2821 /* 2822 * From the Intel SDM, volume 3: 2823 * Fields relevant to VM-entry event injection must be set properly. 2824 * These fields are the VM-entry interruption-information field, the 2825 * VM-entry exception error code, and the VM-entry instruction length. 2826 */ 2827 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2828 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2829 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2830 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2831 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2832 bool should_have_error_code; 2833 bool urg = nested_cpu_has2(vmcs12, 2834 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2835 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2836 2837 /* VM-entry interruption-info field: interruption type */ 2838 if (CC(intr_type == INTR_TYPE_RESERVED) || 2839 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2840 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2841 return -EINVAL; 2842 2843 /* VM-entry interruption-info field: vector */ 2844 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2845 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2846 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2847 return -EINVAL; 2848 2849 /* VM-entry interruption-info field: deliver error code */ 2850 should_have_error_code = 2851 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2852 x86_exception_has_error_code(vector); 2853 if (CC(has_error_code != should_have_error_code)) 2854 return -EINVAL; 2855 2856 /* VM-entry exception error code */ 2857 if (CC(has_error_code && 2858 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2859 return -EINVAL; 2860 2861 /* VM-entry interruption-info field: reserved bits */ 2862 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2863 return -EINVAL; 2864 2865 /* VM-entry instruction length */ 2866 switch (intr_type) { 2867 case INTR_TYPE_SOFT_EXCEPTION: 2868 case INTR_TYPE_SOFT_INTR: 2869 case INTR_TYPE_PRIV_SW_EXCEPTION: 2870 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2871 CC(vmcs12->vm_entry_instruction_len == 0 && 2872 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2873 return -EINVAL; 2874 } 2875 } 2876 2877 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2878 return -EINVAL; 2879 2880 return 0; 2881 } 2882 2883 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2884 struct vmcs12 *vmcs12) 2885 { 2886 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2887 nested_check_vm_exit_controls(vcpu, vmcs12) || 2888 nested_check_vm_entry_controls(vcpu, vmcs12)) 2889 return -EINVAL; 2890 2891 if (guest_cpuid_has_evmcs(vcpu)) 2892 return nested_evmcs_check_controls(vmcs12); 2893 2894 return 0; 2895 } 2896 2897 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 2898 struct vmcs12 *vmcs12) 2899 { 2900 #ifdef CONFIG_X86_64 2901 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 2902 !!(vcpu->arch.efer & EFER_LMA))) 2903 return -EINVAL; 2904 #endif 2905 return 0; 2906 } 2907 2908 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2909 struct vmcs12 *vmcs12) 2910 { 2911 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 2912 2913 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2914 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) 
|| 2915 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) 2916 return -EINVAL; 2917 2918 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2919 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2920 return -EINVAL; 2921 2922 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2923 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2924 return -EINVAL; 2925 2926 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2927 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2928 vmcs12->host_ia32_perf_global_ctrl))) 2929 return -EINVAL; 2930 2931 if (ia32e) { 2932 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2933 return -EINVAL; 2934 } else { 2935 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2936 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2937 CC((vmcs12->host_rip) >> 32)) 2938 return -EINVAL; 2939 } 2940 2941 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2942 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2943 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2944 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2945 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2946 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2947 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2948 CC(vmcs12->host_cs_selector == 0) || 2949 CC(vmcs12->host_tr_selector == 0) || 2950 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2951 return -EINVAL; 2952 2953 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2954 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2955 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2956 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2957 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2958 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2959 return -EINVAL; 2960 2961 /* 2962 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2963 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2964 * the values of the LMA and LME bits in the field must each be that of 2965 * the host address-space size VM-exit control. 
2966 */ 2967 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2968 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2969 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2970 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2971 return -EINVAL; 2972 } 2973 2974 return 0; 2975 } 2976 2977 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2978 struct vmcs12 *vmcs12) 2979 { 2980 struct vcpu_vmx *vmx = to_vmx(vcpu); 2981 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 2982 struct vmcs_hdr hdr; 2983 2984 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 2985 return 0; 2986 2987 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2988 return -EINVAL; 2989 2990 if (ghc->gpa != vmcs12->vmcs_link_pointer && 2991 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 2992 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 2993 return -EINVAL; 2994 2995 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 2996 offsetof(struct vmcs12, hdr), 2997 sizeof(hdr)))) 2998 return -EINVAL; 2999 3000 if (CC(hdr.revision_id != VMCS12_REVISION) || 3001 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3002 return -EINVAL; 3003 3004 return 0; 3005 } 3006 3007 /* 3008 * Checks related to Guest Non-register State 3009 */ 3010 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3011 { 3012 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3013 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3014 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3015 return -EINVAL; 3016 3017 return 0; 3018 } 3019 3020 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3021 struct vmcs12 *vmcs12, 3022 enum vm_entry_failure_code *entry_failure_code) 3023 { 3024 bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); 3025 3026 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3027 3028 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3029 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3030 return -EINVAL; 3031 3032 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3033 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 3034 return -EINVAL; 3035 3036 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3037 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3038 return -EINVAL; 3039 3040 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3041 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3042 return -EINVAL; 3043 } 3044 3045 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3046 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3047 vmcs12->guest_ia32_perf_global_ctrl))) 3048 return -EINVAL; 3049 3050 if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) 3051 return -EINVAL; 3052 3053 if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || 3054 CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) 3055 return -EINVAL; 3056 3057 /* 3058 * If the load IA32_EFER VM-entry control is 1, the following checks 3059 * are performed on the field for the IA32_EFER MSR: 3060 * - Bits reserved in the IA32_EFER MSR must be 0. 3061 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3062 * the IA-32e mode guest VM-exit control. It must also be identical 3063 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3064 * CR0.PG) is 1. 
3065 */ 3066 if (to_vmx(vcpu)->nested.nested_run_pending && 3067 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3068 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3069 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3070 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3071 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3072 return -EINVAL; 3073 } 3074 3075 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3076 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3077 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3078 return -EINVAL; 3079 3080 if (nested_check_guest_non_reg_state(vmcs12)) 3081 return -EINVAL; 3082 3083 return 0; 3084 } 3085 3086 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3087 { 3088 struct vcpu_vmx *vmx = to_vmx(vcpu); 3089 unsigned long cr3, cr4; 3090 bool vm_fail; 3091 3092 if (!nested_early_check) 3093 return 0; 3094 3095 if (vmx->msr_autoload.host.nr) 3096 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3097 if (vmx->msr_autoload.guest.nr) 3098 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3099 3100 preempt_disable(); 3101 3102 vmx_prepare_switch_to_guest(vcpu); 3103 3104 /* 3105 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3106 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3107 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3108 * there is no need to preserve other bits or save/restore the field. 3109 */ 3110 vmcs_writel(GUEST_RFLAGS, 0); 3111 3112 cr3 = __get_current_cr3_fast(); 3113 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3114 vmcs_writel(HOST_CR3, cr3); 3115 vmx->loaded_vmcs->host_state.cr3 = cr3; 3116 } 3117 3118 cr4 = cr4_read_shadow(); 3119 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3120 vmcs_writel(HOST_CR4, cr4); 3121 vmx->loaded_vmcs->host_state.cr4 = cr4; 3122 } 3123 3124 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3125 __vmx_vcpu_run_flags(vmx)); 3126 3127 if (vmx->msr_autoload.host.nr) 3128 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3129 if (vmx->msr_autoload.guest.nr) 3130 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3131 3132 if (vm_fail) { 3133 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3134 3135 preempt_enable(); 3136 3137 trace_kvm_nested_vmenter_failed( 3138 "early hardware check VM-instruction error: ", error); 3139 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3140 return 1; 3141 } 3142 3143 /* 3144 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3145 */ 3146 if (hw_breakpoint_active()) 3147 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3148 local_irq_enable(); 3149 preempt_enable(); 3150 3151 /* 3152 * A non-failing VMEntry means we somehow entered guest mode with 3153 * an illegal RIP, and that's just the tip of the iceberg. There 3154 * is no telling what memory has been modified or what state has 3155 * been exposed to unknown code. Hitting this all but guarantees 3156 * a (very critical) hardware issue. 3157 */ 3158 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3159 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3160 3161 return 0; 3162 } 3163 3164 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3165 { 3166 struct vcpu_vmx *vmx = to_vmx(vcpu); 3167 3168 /* 3169 * hv_evmcs may end up being not mapped after migration (when 3170 * L2 was running), map it here to make sure vmcs12 changes are 3171 * properly reflected. 
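* nested_vmx_handle_enlightened_vmptrld() re-reads the eVMCS GPA from the VP assist page and re-establishes the mapping.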
3172 */ 3173 if (guest_cpuid_has_evmcs(vcpu) && 3174 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3175 enum nested_evmptrld_status evmptrld_status = 3176 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3177 3178 if (evmptrld_status == EVMPTRLD_VMFAIL || 3179 evmptrld_status == EVMPTRLD_ERROR) 3180 return false; 3181 3182 /* 3183 * Post migration VMCS12 always provides the most actual 3184 * information, copy it to eVMCS upon entry. 3185 */ 3186 vmx->nested.need_vmcs12_to_shadow_sync = true; 3187 } 3188 3189 return true; 3190 } 3191 3192 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3193 { 3194 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3195 struct vcpu_vmx *vmx = to_vmx(vcpu); 3196 struct kvm_host_map *map; 3197 3198 if (!vcpu->arch.pdptrs_from_userspace && 3199 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3200 /* 3201 * Reload the guest's PDPTRs since after a migration 3202 * the guest CR3 might be restored prior to setting the nested 3203 * state which can lead to a load of wrong PDPTRs. 3204 */ 3205 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3206 return false; 3207 } 3208 3209 3210 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3211 map = &vmx->nested.apic_access_page_map; 3212 3213 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3214 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3215 } else { 3216 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3217 __func__); 3218 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3219 vcpu->run->internal.suberror = 3220 KVM_INTERNAL_ERROR_EMULATION; 3221 vcpu->run->internal.ndata = 0; 3222 return false; 3223 } 3224 } 3225 3226 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3227 map = &vmx->nested.virtual_apic_map; 3228 3229 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3230 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3231 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3232 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3233 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3234 /* 3235 * The processor will never use the TPR shadow, simply 3236 * clear the bit from the execution control. Such a 3237 * configuration is useless, but it happens in tests. 3238 * For any other configuration, failing the vm entry is 3239 * _not_ what the processor does but it's basically the 3240 * only possibility we have. 3241 */ 3242 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3243 } else { 3244 /* 3245 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3246 * force VM-Entry to fail. 3247 */ 3248 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3249 } 3250 } 3251 3252 if (nested_cpu_has_posted_intr(vmcs12)) { 3253 map = &vmx->nested.pi_desc_map; 3254 3255 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3256 vmx->nested.pi_desc = 3257 (struct pi_desc *)(((void *)map->hva) + 3258 offset_in_page(vmcs12->posted_intr_desc_addr)); 3259 vmcs_write64(POSTED_INTR_DESC_ADDR, 3260 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3261 } else { 3262 /* 3263 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3264 * access the contents of the VMCS12 posted interrupt 3265 * descriptor. (Note that KVM may do this when it 3266 * should not, per the architectural specification.) 
3267 */ 3268 vmx->nested.pi_desc = NULL; 3269 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3270 } 3271 } 3272 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3273 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3274 else 3275 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3276 3277 return true; 3278 } 3279 3280 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3281 { 3282 /* 3283 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3284 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3285 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3286 * migration. 3287 */ 3288 if (!nested_get_evmcs_page(vcpu)) { 3289 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3290 __func__); 3291 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3292 vcpu->run->internal.suberror = 3293 KVM_INTERNAL_ERROR_EMULATION; 3294 vcpu->run->internal.ndata = 0; 3295 3296 return false; 3297 } 3298 3299 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3300 return false; 3301 3302 return true; 3303 } 3304 3305 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3306 { 3307 struct vmcs12 *vmcs12; 3308 struct vcpu_vmx *vmx = to_vmx(vcpu); 3309 gpa_t dst; 3310 3311 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3312 return 0; 3313 3314 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3315 return 1; 3316 3317 /* 3318 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3319 * set is already checked as part of A/D emulation. 3320 */ 3321 vmcs12 = get_vmcs12(vcpu); 3322 if (!nested_cpu_has_pml(vmcs12)) 3323 return 0; 3324 3325 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3326 vmx->nested.pml_full = true; 3327 return 1; 3328 } 3329 3330 gpa &= ~0xFFFull; 3331 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3332 3333 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3334 offset_in_page(dst), sizeof(gpa))) 3335 return 0; 3336 3337 vmcs12->guest_pml_index--; 3338 3339 return 0; 3340 } 3341 3342 /* 3343 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3344 * for running VMX instructions (except VMXON, whose prerequisites are 3345 * slightly different). It also specifies what exception to inject otherwise. 3346 * Note that many of these exceptions have priority over VM exits, so they 3347 * don't have to be checked again here. 3348 */ 3349 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3350 { 3351 if (!to_vmx(vcpu)->nested.vmxon) { 3352 kvm_queue_exception(vcpu, UD_VECTOR); 3353 return 0; 3354 } 3355 3356 if (vmx_get_cpl(vcpu)) { 3357 kvm_inject_gp(vcpu, 0); 3358 return 0; 3359 } 3360 3361 return 1; 3362 } 3363 3364 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3365 { 3366 u8 rvi = vmx_get_rvi(); 3367 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3368 3369 return ((rvi & 0xf0) > (vppr & 0xf0)); 3370 } 3371 3372 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3373 struct vmcs12 *vmcs12); 3374 3375 /* 3376 * If from_vmentry is false, this is being called from state restore (either RSM 3377 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
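nested_vmx_write_pml_buffer() above fills the PML area from the highest index downward: each logged address is page-aligned, stored at pml_address + 8 * guest_pml_index, and the index is decremented; an index that is already out of range is instead reported as a PML-full event. A condensed sketch of that bookkeeping, where the 512-entry size and the guest-write callback are assumptions of the example:

#include <stdbool.h>
#include <stdint.h>

#define SK_PML_ENTRIES	512	/* PML buffer holds 512 u64 entries */

struct sk_pml {
	uint64_t pml_address;	/* guest physical base of the PML buffer */
	uint16_t pml_index;	/* next entry to fill, counts down from 511 */
};

/* Caller-provided helper that writes one u64 into guest memory. */
typedef void (*sk_write_guest_fn)(uint64_t gpa, uint64_t value);

/*
 * Log one guest-physical address.  Returns false when the buffer is
 * already full, i.e. the caller should synthesize a page-modification
 * log-full event instead of logging.
 */
static bool sk_pml_log(struct sk_pml *pml, uint64_t gpa, sk_write_guest_fn put)
{
	if (pml->pml_index >= SK_PML_ENTRIES)
		return false;		/* index has wrapped past 0: full */

	gpa &= ~0xFFFull;		/* entries are 4 KiB aligned */
	put(pml->pml_address + sizeof(uint64_t) * pml->pml_index, gpa);
	pml->pml_index--;		/* wraps to 0xFFFF once exhausted */
	return true;
}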
3378 * 3379 * Returns: 3380 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3381 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3382 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3383 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3384 */ 3385 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3386 bool from_vmentry) 3387 { 3388 struct vcpu_vmx *vmx = to_vmx(vcpu); 3389 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3390 enum vm_entry_failure_code entry_failure_code; 3391 bool evaluate_pending_interrupts; 3392 union vmx_exit_reason exit_reason = { 3393 .basic = EXIT_REASON_INVALID_STATE, 3394 .failed_vmentry = 1, 3395 }; 3396 u32 failed_index; 3397 3398 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3399 vmx->nested.current_vmptr, 3400 vmcs12->guest_rip, 3401 vmcs12->guest_intr_status, 3402 vmcs12->vm_entry_intr_info_field, 3403 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3404 vmcs12->ept_pointer, 3405 vmcs12->guest_cr3, 3406 KVM_ISA_VMX); 3407 3408 kvm_service_local_tlb_flush_requests(vcpu); 3409 3410 evaluate_pending_interrupts = exec_controls_get(vmx) & 3411 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3412 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3413 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3414 if (!evaluate_pending_interrupts) 3415 evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); 3416 3417 if (!vmx->nested.nested_run_pending || 3418 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3419 vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3420 if (kvm_mpx_supported() && 3421 (!vmx->nested.nested_run_pending || 3422 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3423 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3424 3425 /* 3426 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3427 * nested early checks are disabled. In the event of a "late" VM-Fail, 3428 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3429 * software model to the pre-VMEntry host state. When EPT is disabled, 3430 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3431 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3432 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3433 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3434 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3435 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3436 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3437 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3438 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3439 * path would need to manually save/restore vmcs01.GUEST_CR3. 
3440 */ 3441 if (!enable_ept && !nested_early_check) 3442 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3443 3444 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3445 3446 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3447 3448 if (from_vmentry) { 3449 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3450 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3451 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3452 } 3453 3454 if (nested_vmx_check_vmentry_hw(vcpu)) { 3455 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3456 return NVMX_VMENTRY_VMFAIL; 3457 } 3458 3459 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3460 &entry_failure_code)) { 3461 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3462 vmcs12->exit_qualification = entry_failure_code; 3463 goto vmentry_fail_vmexit; 3464 } 3465 } 3466 3467 enter_guest_mode(vcpu); 3468 3469 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3470 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3471 vmcs12->exit_qualification = entry_failure_code; 3472 goto vmentry_fail_vmexit_guest_mode; 3473 } 3474 3475 if (from_vmentry) { 3476 failed_index = nested_vmx_load_msr(vcpu, 3477 vmcs12->vm_entry_msr_load_addr, 3478 vmcs12->vm_entry_msr_load_count); 3479 if (failed_index) { 3480 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3481 vmcs12->exit_qualification = failed_index; 3482 goto vmentry_fail_vmexit_guest_mode; 3483 } 3484 } else { 3485 /* 3486 * The MMU is not initialized to point at the right entities yet and 3487 * "get pages" would need to read data from the guest (i.e. we will 3488 * need to perform gpa to hpa translation). Request a call 3489 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3490 * have already been set at vmentry time and should not be reset. 3491 */ 3492 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3493 } 3494 3495 /* 3496 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3497 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3498 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3499 * unconditionally. 3500 */ 3501 if (unlikely(evaluate_pending_interrupts)) 3502 kvm_make_request(KVM_REQ_EVENT, vcpu); 3503 3504 /* 3505 * Do not start the preemption timer hrtimer until after we know 3506 * we are successful, so that only nested_vmx_vmexit needs to cancel 3507 * the timer. 3508 */ 3509 vmx->nested.preemption_timer_expired = false; 3510 if (nested_cpu_has_preemption_timer(vmcs12)) { 3511 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3512 vmx_start_preemption_timer(vcpu, timer_value); 3513 } 3514 3515 /* 3516 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3517 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3518 * returned as far as L1 is concerned. It will only return (and set 3519 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3520 */ 3521 return NVMX_VMENTRY_SUCCESS; 3522 3523 /* 3524 * A failed consistency check that leads to a VMExit during L1's 3525 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3526 * 26.7 "VM-entry failures during or after loading guest state". 
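On the real VMLAUNCH/VMRESUME path above, the VM-entry MSR-load area is processed entry by entry and a failure is surfaced as an MSR-load-fail exit whose qualification is the 1-based index of the offending entry. A compact sketch of that contract; the entry layout mirrors the architectural format, while the setter callback is an assumption of the example:

#include <stdint.h>

/* Layout of one VM-entry MSR-load entry: index, reserved, value. */
struct sk_msr_entry {
	uint32_t index;
	uint32_t reserved;
	uint64_t value;
};

/* Caller-supplied WRMSR emulation; returns 0 on success. */
typedef int (*sk_set_msr_fn)(uint32_t index, uint64_t value);

/*
 * Process 'count' entries.  Returns 0 if every load succeeds, otherwise
 * the 1-based index of the first entry that failed, which the caller
 * stores in the exit qualification of the MSR-load-fail VM-exit.
 */
static uint32_t sk_load_msr_list(const struct sk_msr_entry *list,
				 uint32_t count, sk_set_msr_fn set_msr)
{
	uint32_t i;

	for (i = 0; i < count; i++) {
		if (list[i].reserved || set_msr(list[i].index, list[i].value))
			return i + 1;
	}
	return 0;
}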
3527 */ 3528 vmentry_fail_vmexit_guest_mode: 3529 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3530 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3531 leave_guest_mode(vcpu); 3532 3533 vmentry_fail_vmexit: 3534 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3535 3536 if (!from_vmentry) 3537 return NVMX_VMENTRY_VMEXIT; 3538 3539 load_vmcs12_host_state(vcpu, vmcs12); 3540 vmcs12->vm_exit_reason = exit_reason.full; 3541 if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 3542 vmx->nested.need_vmcs12_to_shadow_sync = true; 3543 return NVMX_VMENTRY_VMEXIT; 3544 } 3545 3546 /* 3547 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3548 * for running an L2 nested guest. 3549 */ 3550 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3551 { 3552 struct vmcs12 *vmcs12; 3553 enum nvmx_vmentry_status status; 3554 struct vcpu_vmx *vmx = to_vmx(vcpu); 3555 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3556 enum nested_evmptrld_status evmptrld_status; 3557 3558 if (!nested_vmx_check_permission(vcpu)) 3559 return 1; 3560 3561 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3562 if (evmptrld_status == EVMPTRLD_ERROR) { 3563 kvm_queue_exception(vcpu, UD_VECTOR); 3564 return 1; 3565 } 3566 3567 kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 3568 3569 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3570 return nested_vmx_failInvalid(vcpu); 3571 3572 if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && 3573 vmx->nested.current_vmptr == INVALID_GPA)) 3574 return nested_vmx_failInvalid(vcpu); 3575 3576 vmcs12 = get_vmcs12(vcpu); 3577 3578 /* 3579 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3580 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3581 * rather than RFLAGS.ZF, and no error number is stored to the 3582 * VM-instruction error field. 3583 */ 3584 if (CC(vmcs12->hdr.shadow_vmcs)) 3585 return nested_vmx_failInvalid(vcpu); 3586 3587 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 3588 copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); 3589 /* Enlightened VMCS doesn't have launch state */ 3590 vmcs12->launch_state = !launch; 3591 } else if (enable_shadow_vmcs) { 3592 copy_shadow_to_vmcs12(vmx); 3593 } 3594 3595 /* 3596 * The nested entry process starts with enforcing various prerequisites 3597 * on vmcs12 as required by the Intel SDM, and act appropriately when 3598 * they fail: As the SDM explains, some conditions should cause the 3599 * instruction to fail, while others will cause the instruction to seem 3600 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3601 * To speed up the normal (success) code path, we should avoid checking 3602 * for misconfigurations which will anyway be caught by the processor 3603 * when using the merged vmcs02. 3604 */ 3605 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3606 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3607 3608 if (CC(vmcs12->launch_state == launch)) 3609 return nested_vmx_fail(vcpu, 3610 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3611 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3612 3613 if (nested_vmx_check_controls(vcpu, vmcs12)) 3614 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3615 3616 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3617 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3618 3619 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3620 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3621 3622 /* 3623 * We're finally done with prerequisite checking, and can start with 3624 * the nested entry. 3625 */ 3626 vmx->nested.nested_run_pending = 1; 3627 vmx->nested.has_preemption_timer_deadline = false; 3628 status = nested_vmx_enter_non_root_mode(vcpu, true); 3629 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3630 goto vmentry_failed; 3631 3632 /* Emulate processing of posted interrupts on VM-Enter. */ 3633 if (nested_cpu_has_posted_intr(vmcs12) && 3634 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3635 vmx->nested.pi_pending = true; 3636 kvm_make_request(KVM_REQ_EVENT, vcpu); 3637 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3638 } 3639 3640 /* Hide L1D cache contents from the nested guest. */ 3641 vmx->vcpu.arch.l1tf_flush_l1d = true; 3642 3643 /* 3644 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3645 * also be used as part of restoring nVMX state for 3646 * snapshot restore (migration). 3647 * 3648 * In this flow, it is assumed that vmcs12 cache was 3649 * transferred as part of captured nVMX state and should 3650 * therefore not be read from guest memory (which may not 3651 * exist on destination host yet). 3652 */ 3653 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3654 3655 switch (vmcs12->guest_activity_state) { 3656 case GUEST_ACTIVITY_HLT: 3657 /* 3658 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3659 * awakened by event injection or by an NMI-window VM-exit or 3660 * by an interrupt-window VM-exit, halt the vcpu. 3661 */ 3662 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3663 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3664 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3665 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3666 vmx->nested.nested_run_pending = 0; 3667 return kvm_emulate_halt_noskip(vcpu); 3668 } 3669 break; 3670 case GUEST_ACTIVITY_WAIT_SIPI: 3671 vmx->nested.nested_run_pending = 0; 3672 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3673 break; 3674 default: 3675 break; 3676 } 3677 3678 return 1; 3679 3680 vmentry_failed: 3681 vmx->nested.nested_run_pending = 0; 3682 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3683 return 0; 3684 if (status == NVMX_VMENTRY_VMEXIT) 3685 return 1; 3686 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3687 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3688 } 3689 3690 /* 3691 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3692 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3693 * This function returns the new value we should put in vmcs12.guest_cr0. 3694 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3695 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3696 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3697 * didn't trap the bit, because if L1 did, so would L0). 3698 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3699 * been modified by L2, and L1 knows it. 
So just leave the old value of 3700 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3701 * isn't relevant, because if L0 traps this bit it can set it to anything. 3702 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3703 * changed these bits, and therefore they need to be updated, but L0 3704 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3705 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3706 */ 3707 static inline unsigned long 3708 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3709 { 3710 return 3711 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3712 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3713 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3714 vcpu->arch.cr0_guest_owned_bits)); 3715 } 3716 3717 static inline unsigned long 3718 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3719 { 3720 return 3721 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3722 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3723 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3724 vcpu->arch.cr4_guest_owned_bits)); 3725 } 3726 3727 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3728 struct vmcs12 *vmcs12, 3729 u32 vm_exit_reason, u32 exit_intr_info) 3730 { 3731 u32 idt_vectoring; 3732 unsigned int nr; 3733 3734 /* 3735 * Per the SDM, VM-Exits due to double and triple faults are never 3736 * considered to occur during event delivery, even if the double/triple 3737 * fault is the result of an escalating vectoring issue. 3738 * 3739 * Note, the SDM qualifies the double fault behavior with "The original 3740 * event results in a double-fault exception". It's unclear why the 3741 * qualification exists since exits due to double fault can occur only 3742 * while vectoring a different exception (injected events are never 3743 * subject to interception), i.e. there's _always_ an original event. 3744 * 3745 * The SDM also uses NMI as a confusing example for the "original event 3746 * causes the VM exit directly" clause. NMI isn't special in any way, 3747 * the same rule applies to all events that cause an exit directly. 3748 * NMI is an odd choice for the example because NMIs can only occur on 3749 * instruction boundaries, i.e. they _can't_ occur during vectoring. 
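The three cases spelled out for vmcs12.guest_cr0 above collapse into a single mask expression; the sketch below restates it with explicit parameters so the provenance of each bit class stays visible (standalone and illustrative, the real helper reads the vmcs02 fields directly):

#include <stdint.h>

/*
 * Merge the three sources of vmcs12.guest_cr0 on a nested VM-exit:
 *   1. bits owned by the guest (neither L0 nor L1 intercepts them)
 *      come from the hardware GUEST_CR0 in vmcs02;
 *   2. bits L1 intercepts keep the old vmcs12 value;
 *   3. bits only L0 intercepts live in the vmcs02 CR0 read shadow.
 */
static uint64_t sk_merge_guest_cr0(uint64_t vmcs02_guest_cr0,
				   uint64_t vmcs02_cr0_read_shadow,
				   uint64_t vmcs12_guest_cr0,
				   uint64_t l1_mask,	    /* cr0_guest_host_mask */
				   uint64_t l0_guest_owned) /* bits L0 leaves to the guest */
{
	return (vmcs02_guest_cr0 & l0_guest_owned) |
	       (vmcs12_guest_cr0 & l1_mask) |
	       (vmcs02_cr0_read_shadow & ~(l1_mask | l0_guest_owned));
}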
3750 */ 3751 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3752 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3753 is_double_fault(exit_intr_info))) { 3754 vmcs12->idt_vectoring_info_field = 0; 3755 } else if (vcpu->arch.exception.injected) { 3756 nr = vcpu->arch.exception.vector; 3757 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3758 3759 if (kvm_exception_is_soft(nr)) { 3760 vmcs12->vm_exit_instruction_len = 3761 vcpu->arch.event_exit_inst_len; 3762 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3763 } else 3764 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3765 3766 if (vcpu->arch.exception.has_error_code) { 3767 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3768 vmcs12->idt_vectoring_error_code = 3769 vcpu->arch.exception.error_code; 3770 } 3771 3772 vmcs12->idt_vectoring_info_field = idt_vectoring; 3773 } else if (vcpu->arch.nmi_injected) { 3774 vmcs12->idt_vectoring_info_field = 3775 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3776 } else if (vcpu->arch.interrupt.injected) { 3777 nr = vcpu->arch.interrupt.nr; 3778 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3779 3780 if (vcpu->arch.interrupt.soft) { 3781 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3782 vmcs12->vm_entry_instruction_len = 3783 vcpu->arch.event_exit_inst_len; 3784 } else 3785 idt_vectoring |= INTR_TYPE_EXT_INTR; 3786 3787 vmcs12->idt_vectoring_info_field = idt_vectoring; 3788 } else { 3789 vmcs12->idt_vectoring_info_field = 0; 3790 } 3791 } 3792 3793 3794 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3795 { 3796 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3797 gfn_t gfn; 3798 3799 /* 3800 * Don't need to mark the APIC access page dirty; it is never 3801 * written to by the CPU during APIC virtualization. 3802 */ 3803 3804 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3805 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3806 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3807 } 3808 3809 if (nested_cpu_has_posted_intr(vmcs12)) { 3810 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3811 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3812 } 3813 } 3814 3815 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3816 { 3817 struct vcpu_vmx *vmx = to_vmx(vcpu); 3818 int max_irr; 3819 void *vapic_page; 3820 u16 status; 3821 3822 if (!vmx->nested.pi_pending) 3823 return 0; 3824 3825 if (!vmx->nested.pi_desc) 3826 goto mmio_needed; 3827 3828 vmx->nested.pi_pending = false; 3829 3830 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3831 return 0; 3832 3833 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3834 if (max_irr != 256) { 3835 vapic_page = vmx->nested.virtual_apic_map.hva; 3836 if (!vapic_page) 3837 goto mmio_needed; 3838 3839 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3840 vapic_page, &max_irr); 3841 status = vmcs_read16(GUEST_INTR_STATUS); 3842 if ((u8)max_irr > ((u8)status & 0xff)) { 3843 status &= ~0xff; 3844 status |= (u8)max_irr; 3845 vmcs_write16(GUEST_INTR_STATUS, status); 3846 } 3847 } 3848 3849 nested_mark_vmcs12_pages_dirty(vcpu); 3850 return 0; 3851 3852 mmio_needed: 3853 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3854 return -ENXIO; 3855 } 3856 3857 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 3858 { 3859 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 3860 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 3861 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3862 unsigned long exit_qual; 3863 3864 if (ex->has_payload) { 3865 exit_qual = ex->payload; 3866 } else if (ex->vector == 
PF_VECTOR) { 3867 exit_qual = vcpu->arch.cr2; 3868 } else if (ex->vector == DB_VECTOR) { 3869 exit_qual = vcpu->arch.dr6; 3870 exit_qual &= ~DR6_BT; 3871 exit_qual ^= DR6_ACTIVE_LOW; 3872 } else { 3873 exit_qual = 0; 3874 } 3875 3876 /* 3877 * Unlike AMD's Paged Real Mode, which reports an error code on #PF 3878 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 3879 * "has error code" flags on VM-Exit if the CPU is in Real Mode. 3880 */ 3881 if (ex->has_error_code && is_protmode(vcpu)) { 3882 /* 3883 * Intel CPUs do not generate error codes with bits 31:16 set, 3884 * and more importantly VMX disallows setting bits 31:16 in the 3885 * injected error code for VM-Entry. Drop the bits to mimic 3886 * hardware and avoid inducing failure on nested VM-Entry if L1 3887 * chooses to inject the exception back to L2. AMD CPUs _do_ 3888 * generate "full" 32-bit error codes, so KVM allows userspace 3889 * to inject exception error codes with bits 31:16 set. 3890 */ 3891 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 3892 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3893 } 3894 3895 if (kvm_exception_is_soft(ex->vector)) 3896 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3897 else 3898 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3899 3900 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3901 vmx_get_nmi_mask(vcpu)) 3902 intr_info |= INTR_INFO_UNBLOCK_NMI; 3903 3904 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3905 } 3906 3907 /* 3908 * Returns true if a debug trap is (likely) pending delivery. Infer the class 3909 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). 3910 * Using the payload is flawed because code breakpoints (fault-like) and data 3911 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 3912 * this will return false positives if a to-be-injected code breakpoint #DB is 3913 * pending (from KVM's perspective, but not "pending" across an instruction 3914 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 3915 * too is trap-like. 3916 * 3917 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 3918 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 3919 * #DB has already happened), and MTF isn't marked pending on code breakpoints 3920 * from the emulator (because such #DBs are fault-like and thus don't trigger 3921 * actions that fire on instruction retire). 3922 */ 3923 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 3924 { 3925 if (!ex->pending || ex->vector != DB_VECTOR) 3926 return 0; 3927 3928 /* General Detect #DBs are always fault-like. */ 3929 return ex->payload & ~DR6_BD; 3930 } 3931 3932 /* 3933 * Returns true if there's a pending #DB exception that is lower priority than 3934 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 3935 * KVM, but could theoretically be injected by userspace. Note, this code is 3936 * imperfect, see above. 3937 */ 3938 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 3939 { 3940 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 3941 } 3942 3943 /* 3944 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 3945 * recognized #DB (data or single-step) that has yet to be delivered. 
Since KVM 3946 * represents these debug traps with a payload that is said to be compatible 3947 * with the 'pending debug exceptions' field, write the payload to the VMCS 3948 * field if a VM-exit is delivered before the debug trap. 3949 */ 3950 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 3951 { 3952 unsigned long pending_dbg; 3953 3954 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 3955 if (pending_dbg) 3956 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 3957 } 3958 3959 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 3960 { 3961 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3962 to_vmx(vcpu)->nested.preemption_timer_expired; 3963 } 3964 3965 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu) 3966 { 3967 return nested_vmx_preemption_timer_pending(vcpu) || 3968 to_vmx(vcpu)->nested.mtf_pending; 3969 } 3970 3971 /* 3972 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 3973 * edits to fill in missing examples, e.g. #DB due to split-lock accesses, 3974 * and less minor edits to splice in the priority of VMX Non-Root specific 3975 * events, e.g. MTF and NMI/INTR-window exiting. 3976 * 3977 * 1 Hardware Reset and Machine Checks 3978 * - RESET 3979 * - Machine Check 3980 * 3981 * 2 Trap on Task Switch 3982 * - T flag in TSS is set (on task switch) 3983 * 3984 * 3 External Hardware Interventions 3985 * - FLUSH 3986 * - STOPCLK 3987 * - SMI 3988 * - INIT 3989 * 3990 * 3.5 Monitor Trap Flag (MTF) VM-exit[1] 3991 * 3992 * 4 Traps on Previous Instruction 3993 * - Breakpoints 3994 * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O 3995 * breakpoint, or #DB due to a split-lock access) 3996 * 3997 * 4.3 VMX-preemption timer expired VM-exit 3998 * 3999 * 4.6 NMI-window exiting VM-exit[2] 4000 * 4001 * 5 Nonmaskable Interrupts (NMI) 4002 * 4003 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery 4004 * 4005 * 6 Maskable Hardware Interrupts 4006 * 4007 * 7 Code Breakpoint Fault 4008 * 4009 * 8 Faults from Fetching Next Instruction 4010 * - Code-Segment Limit Violation 4011 * - Code Page Fault 4012 * - Control protection exception (missing ENDBRANCH at target of indirect 4013 * call or jump) 4014 * 4015 * 9 Faults from Decoding Next Instruction 4016 * - Instruction length > 15 bytes 4017 * - Invalid Opcode 4018 * - Coprocessor Not Available 4019 * 4020 *10 Faults on Executing Instruction 4021 * - Overflow 4022 * - Bound error 4023 * - Invalid TSS 4024 * - Segment Not Present 4025 * - Stack fault 4026 * - General Protection 4027 * - Data Page Fault 4028 * - Alignment Check 4029 * - x86 FPU Floating-point exception 4030 * - SIMD floating-point exception 4031 * - Virtualization exception 4032 * - Control protection exception 4033 * 4034 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), 4035 * INIT signals, and higher priority events take priority over MTF VM exits. 4036 * MTF VM exits take priority over debug-trap exceptions and lower priority 4037 * events. 4038 * 4039 * [2] Debug-trap exceptions and higher priority events take priority over VM exits 4040 * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption 4041 * timer take priority over VM exits caused by the "NMI-window exiting" 4042 * VM-execution control and lower priority events. 4043 * 4044 * [3] Debug-trap exceptions and higher priority events take priority over VM exits 4045 * caused by "NMI-window exiting". 
VM exits caused by this control take 4046 * priority over non-maskable interrupts (NMIs) and lower priority events. 4047 * 4048 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to 4049 * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, 4050 * non-maskable interrupts (NMIs) and higher priority events take priority over 4051 * delivery of a virtual interrupt; delivery of a virtual interrupt takes 4052 * priority over external interrupts and lower priority events. 4053 */ 4054 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 4055 { 4056 struct kvm_lapic *apic = vcpu->arch.apic; 4057 struct vcpu_vmx *vmx = to_vmx(vcpu); 4058 /* 4059 * Only a pending nested run blocks a pending exception. If there is a 4060 * previously injected event, the pending exception occurred while said 4061 * event was being delivered and thus needs to be handled. 4062 */ 4063 bool block_nested_exceptions = vmx->nested.nested_run_pending; 4064 /* 4065 * New events (not exceptions) are only recognized at instruction 4066 * boundaries. If an event needs reinjection, then KVM is handling a 4067 * VM-Exit that occurred _during_ instruction execution; new events are 4068 * blocked until the instruction completes. 4069 */ 4070 bool block_nested_events = block_nested_exceptions || 4071 kvm_event_needs_reinjection(vcpu); 4072 4073 if (lapic_in_kernel(vcpu) && 4074 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 4075 if (block_nested_events) 4076 return -EBUSY; 4077 nested_vmx_update_pending_dbg(vcpu); 4078 clear_bit(KVM_APIC_INIT, &apic->pending_events); 4079 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 4080 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 4081 4082 /* MTF is discarded if the vCPU is in WFS. */ 4083 vmx->nested.mtf_pending = false; 4084 return 0; 4085 } 4086 4087 if (lapic_in_kernel(vcpu) && 4088 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 4089 if (block_nested_events) 4090 return -EBUSY; 4091 4092 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 4093 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 4094 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 4095 apic->sipi_vector & 0xFFUL); 4096 return 0; 4097 } 4098 /* Fallthrough, the SIPI is completely ignored. */ 4099 } 4100 4101 /* 4102 * Process exceptions that are higher priority than Monitor Trap Flag: 4103 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but 4104 * could theoretically come in from userspace), and ICEBP (INT1). 4105 * 4106 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except 4107 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF 4108 * across SMI/RSM as it should; that needs to be addressed in order to 4109 * prioritize SMI over MTF and trap-like #DBs. 
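The checks that follow in vmx_check_nested_events() walk a fixed priority ladder. Listing the ladder as data makes the ordering easier to audit against the table above; the labels are descriptive only, not kernel identifiers:

/* Order in which pending nested events are considered, highest first. */
static const char * const sk_nested_event_order[] = {
	"INIT signal (also discards any pending MTF)",
	"SIPI signal (acted on only in wait-for-SIPI state)",
	"exception VM-exit to L1, above MTF priority",
	"exception delivered to L2, above MTF priority (stops processing)",
	"Monitor Trap Flag VM-exit",
	"remaining exception VM-exits to L1 (trap-like #DB)",
	"remaining exceptions delivered to L2",
	"VMX-preemption timer expiry",
	"pending SMI",
	"pending NMI (VM-exit only if L1 intercepts NMIs)",
	"pending external interrupt (VM-exit only if L1 intercepts them)",
};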
4110 */ 4111 if (vcpu->arch.exception_vmexit.pending && 4112 !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { 4113 if (block_nested_exceptions) 4114 return -EBUSY; 4115 4116 nested_vmx_inject_exception_vmexit(vcpu); 4117 return 0; 4118 } 4119 4120 if (vcpu->arch.exception.pending && 4121 !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { 4122 if (block_nested_exceptions) 4123 return -EBUSY; 4124 goto no_vmexit; 4125 } 4126 4127 if (vmx->nested.mtf_pending) { 4128 if (block_nested_events) 4129 return -EBUSY; 4130 nested_vmx_update_pending_dbg(vcpu); 4131 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 4132 return 0; 4133 } 4134 4135 if (vcpu->arch.exception_vmexit.pending) { 4136 if (block_nested_exceptions) 4137 return -EBUSY; 4138 4139 nested_vmx_inject_exception_vmexit(vcpu); 4140 return 0; 4141 } 4142 4143 if (vcpu->arch.exception.pending) { 4144 if (block_nested_exceptions) 4145 return -EBUSY; 4146 goto no_vmexit; 4147 } 4148 4149 if (nested_vmx_preemption_timer_pending(vcpu)) { 4150 if (block_nested_events) 4151 return -EBUSY; 4152 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 4153 return 0; 4154 } 4155 4156 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 4157 if (block_nested_events) 4158 return -EBUSY; 4159 goto no_vmexit; 4160 } 4161 4162 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 4163 if (block_nested_events) 4164 return -EBUSY; 4165 if (!nested_exit_on_nmi(vcpu)) 4166 goto no_vmexit; 4167 4168 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 4169 NMI_VECTOR | INTR_TYPE_NMI_INTR | 4170 INTR_INFO_VALID_MASK, 0); 4171 /* 4172 * The NMI-triggered VM exit counts as injection: 4173 * clear this one and block further NMIs. 4174 */ 4175 vcpu->arch.nmi_pending = 0; 4176 vmx_set_nmi_mask(vcpu, true); 4177 return 0; 4178 } 4179 4180 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4181 if (block_nested_events) 4182 return -EBUSY; 4183 if (!nested_exit_on_intr(vcpu)) 4184 goto no_vmexit; 4185 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4186 return 0; 4187 } 4188 4189 no_vmexit: 4190 return vmx_complete_nested_posted_interrupt(vcpu); 4191 } 4192 4193 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 4194 { 4195 ktime_t remaining = 4196 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4197 u64 value; 4198 4199 if (ktime_to_ns(remaining) <= 0) 4200 return 0; 4201 4202 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4203 do_div(value, 1000000); 4204 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4205 } 4206 4207 static bool is_vmcs12_ext_field(unsigned long field) 4208 { 4209 switch (field) { 4210 case GUEST_ES_SELECTOR: 4211 case GUEST_CS_SELECTOR: 4212 case GUEST_SS_SELECTOR: 4213 case GUEST_DS_SELECTOR: 4214 case GUEST_FS_SELECTOR: 4215 case GUEST_GS_SELECTOR: 4216 case GUEST_LDTR_SELECTOR: 4217 case GUEST_TR_SELECTOR: 4218 case GUEST_ES_LIMIT: 4219 case GUEST_CS_LIMIT: 4220 case GUEST_SS_LIMIT: 4221 case GUEST_DS_LIMIT: 4222 case GUEST_FS_LIMIT: 4223 case GUEST_GS_LIMIT: 4224 case GUEST_LDTR_LIMIT: 4225 case GUEST_TR_LIMIT: 4226 case GUEST_GDTR_LIMIT: 4227 case GUEST_IDTR_LIMIT: 4228 case GUEST_ES_AR_BYTES: 4229 case GUEST_DS_AR_BYTES: 4230 case GUEST_FS_AR_BYTES: 4231 case GUEST_GS_AR_BYTES: 4232 case GUEST_LDTR_AR_BYTES: 4233 case GUEST_TR_AR_BYTES: 4234 case GUEST_ES_BASE: 4235 case GUEST_CS_BASE: 4236 case GUEST_SS_BASE: 4237 case GUEST_DS_BASE: 4238 case GUEST_FS_BASE: 4239 case GUEST_GS_BASE: 4240 case GUEST_LDTR_BASE: 4241 case 
GUEST_TR_BASE: 4242 case GUEST_GDTR_BASE: 4243 case GUEST_IDTR_BASE: 4244 case GUEST_PENDING_DBG_EXCEPTIONS: 4245 case GUEST_BNDCFGS: 4246 return true; 4247 default: 4248 break; 4249 } 4250 4251 return false; 4252 } 4253 4254 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4255 struct vmcs12 *vmcs12) 4256 { 4257 struct vcpu_vmx *vmx = to_vmx(vcpu); 4258 4259 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4260 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4261 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4262 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4263 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4264 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4265 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4266 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4267 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4268 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4269 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4270 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4271 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4272 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4273 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4274 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4275 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4276 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4277 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4278 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4279 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4280 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4281 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4282 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4283 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4284 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4285 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4286 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4287 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4288 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4289 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4290 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4291 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4292 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4293 vmcs12->guest_pending_dbg_exceptions = 4294 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4295 4296 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4297 } 4298 4299 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4300 struct vmcs12 *vmcs12) 4301 { 4302 struct vcpu_vmx *vmx = to_vmx(vcpu); 4303 int cpu; 4304 4305 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4306 return; 4307 4308 4309 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4310 4311 cpu = get_cpu(); 4312 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4313 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4314 4315 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4316 4317 vmx->loaded_vmcs = &vmx->vmcs01; 4318 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4319 put_cpu(); 4320 } 4321 4322 /* 4323 * Update the guest state fields of vmcs12 to reflect changes that 4324 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4325 * VM-entry controls is also updated, since this is really a guest 4326 * state bit.) 
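Reading the rarely-used guest fields back requires vmcs02 to be the active VMCS, so the deferred path above briefly swaps vmcs02 in, reads, and then restores vmcs01, with the vCPU pinned to one CPU for the duration. A stripped-down sketch of that pattern; the callbacks stand in for the real VMCS load and vmread helpers:

#include <stdbool.h>

/* Stand-ins for the two VMCS contexts and the operations on them. */
struct sk_vmcs;

typedef void (*sk_load_vmcs_fn)(struct sk_vmcs *vmcs);
typedef void (*sk_read_fields_fn)(void);

/*
 * Lazily read back fields that only exist in 'rare' (vmcs02):
 * temporarily make it the active VMCS, read, then restore 'active'.
 * The caller must keep the task on one CPU while the swap is in
 * flight (get_cpu()/put_cpu() in the kernel).
 */
static void sk_sync_rare_fields(bool *need_sync,
				struct sk_vmcs *active,
				struct sk_vmcs *rare,
				sk_load_vmcs_fn load,
				sk_read_fields_fn read_fields)
{
	if (!*need_sync)
		return;

	load(rare);		/* make vmcs02 active */
	read_fields();		/* vmread the rarely-touched guest state */
	load(active);		/* restore vmcs01 */
	*need_sync = false;
}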
4327 */ 4328 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4329 { 4330 struct vcpu_vmx *vmx = to_vmx(vcpu); 4331 4332 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 4333 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4334 4335 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4336 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); 4337 4338 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4339 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4340 4341 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4342 vmcs12->guest_rip = kvm_rip_read(vcpu); 4343 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4344 4345 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4346 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4347 4348 vmcs12->guest_interruptibility_info = 4349 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4350 4351 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4352 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4353 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4354 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4355 else 4356 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4357 4358 if (nested_cpu_has_preemption_timer(vmcs12) && 4359 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4360 !vmx->nested.nested_run_pending) 4361 vmcs12->vmx_preemption_timer_value = 4362 vmx_get_preemption_timer_value(vcpu); 4363 4364 /* 4365 * In some cases (usually, nested EPT), L2 is allowed to change its 4366 * own CR3 without exiting. If it has changed it, we must keep it. 4367 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4368 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4369 * 4370 * Additionally, restore L2's PDPTR to vmcs12. 4371 */ 4372 if (enable_ept) { 4373 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4374 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4375 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4376 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4377 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4378 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4379 } 4380 } 4381 4382 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4383 4384 if (nested_cpu_has_vid(vmcs12)) 4385 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4386 4387 vmcs12->vm_entry_controls = 4388 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4389 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4390 4391 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4392 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 4393 4394 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4395 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4396 } 4397 4398 /* 4399 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4400 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4401 * and this function updates it to reflect the changes to the guest state while 4402 * L2 was running (and perhaps made some exits which were handled directly by L0 4403 * without going back to L1), and to reflect the exit reason. 4404 * Note that we do not have to copy here all VMCS fields, just those that 4405 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4406 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4407 * which already writes to vmcs12 directly. 
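The value stored in vmcs12->vmx_preemption_timer_value above is derived from the time left on the emulated hrtimer: nanoseconds are converted to TSC cycles using the guest's virtual TSC frequency, then shifted down by the emulated rate of 5, i.e. one timer tick per 32 TSC cycles. A standalone restatement of that arithmetic:

#include <stdint.h>

#define SK_EMULATED_PREEMPTION_TIMER_RATE 5	/* timer tick = TSC cycles >> 5 */

/*
 * Convert the remaining time on the emulated preemption timer (in
 * nanoseconds) into the units saved in vmcs12: TSC cycles shifted down
 * by the advertised rate.  tsc_khz is the guest's virtual TSC frequency.
 */
static uint64_t sk_preemption_timer_value(int64_t remaining_ns,
					  uint64_t tsc_khz)
{
	uint64_t value;

	if (remaining_ns <= 0)
		return 0;

	value = (uint64_t)remaining_ns * tsc_khz / 1000000;
	return value >> SK_EMULATED_PREEMPTION_TIMER_RATE;
}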
4408 */ 4409 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4410 u32 vm_exit_reason, u32 exit_intr_info, 4411 unsigned long exit_qualification) 4412 { 4413 /* update exit information fields: */ 4414 vmcs12->vm_exit_reason = vm_exit_reason; 4415 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4416 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4417 vmcs12->exit_qualification = exit_qualification; 4418 4419 /* 4420 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4421 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4422 * exit info fields are unmodified. 4423 */ 4424 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4425 vmcs12->launch_state = 1; 4426 4427 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4428 * instead of reading the real value. */ 4429 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4430 4431 /* 4432 * Transfer the event that L0 or L1 may wanted to inject into 4433 * L2 to IDT_VECTORING_INFO_FIELD. 4434 */ 4435 vmcs12_save_pending_event(vcpu, vmcs12, 4436 vm_exit_reason, exit_intr_info); 4437 4438 vmcs12->vm_exit_intr_info = exit_intr_info; 4439 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4440 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4441 4442 /* 4443 * According to spec, there's no need to store the guest's 4444 * MSRs if the exit is due to a VM-entry failure that occurs 4445 * during or after loading the guest state. Since this exit 4446 * does not fall in that category, we need to save the MSRs. 4447 */ 4448 if (nested_vmx_store_msr(vcpu, 4449 vmcs12->vm_exit_msr_store_addr, 4450 vmcs12->vm_exit_msr_store_count)) 4451 nested_vmx_abort(vcpu, 4452 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4453 } 4454 } 4455 4456 /* 4457 * A part of what we need to when the nested L2 guest exits and we want to 4458 * run its L1 parent, is to reset L1's guest state to the host state specified 4459 * in vmcs12. 4460 * This function is to be called not only on normal nested exit, but also on 4461 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4462 * Failures During or After Loading Guest State"). 4463 * This function should be called when the active VMCS is L1's (vmcs01). 4464 */ 4465 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4466 struct vmcs12 *vmcs12) 4467 { 4468 enum vm_entry_failure_code ignored; 4469 struct kvm_segment seg; 4470 4471 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4472 vcpu->arch.efer = vmcs12->host_ia32_efer; 4473 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4474 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4475 else 4476 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4477 vmx_set_efer(vcpu, vcpu->arch.efer); 4478 4479 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4480 kvm_rip_write(vcpu, vmcs12->host_rip); 4481 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4482 vmx_set_interrupt_shadow(vcpu, 0); 4483 4484 /* 4485 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4486 * actually changed, because vmx_set_cr0 refers to efer set above. 4487 * 4488 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4489 * (KVM doesn't change it); 4490 */ 4491 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4492 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4493 4494 /* Same as above - no reason to call set_cr4_guest_host_mask(). 
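The EFER update at the top of load_vmcs12_host_state() follows the host-state loading rules: when the exit loads IA32_EFER the whole MSR comes from vmcs12, otherwise only LMA and LME are forced to match the "host address-space size" exit control. Sketched as a pure function with locally defined bit positions (illustrative only):

#include <stdbool.h>
#include <stdint.h>

#define SK_EFER_LME	(1ULL << 8)
#define SK_EFER_LMA	(1ULL << 10)

/*
 * Compute L1's EFER after a nested VM-exit: either the full value from
 * vmcs12.host_ia32_efer, or the old value with LMA/LME forced to match
 * the "host address-space size" VM-exit control.
 */
static uint64_t sk_host_efer_on_exit(uint64_t old_efer,
				     uint64_t vmcs12_host_efer,
				     bool load_efer_on_exit,
				     bool host_addr_space_64)
{
	if (load_efer_on_exit)
		return vmcs12_host_efer;
	if (host_addr_space_64)
		return old_efer | (SK_EFER_LMA | SK_EFER_LME);
	return old_efer & ~(SK_EFER_LMA | SK_EFER_LME);
}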
*/ 4495 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4496 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4497 4498 nested_ept_uninit_mmu_context(vcpu); 4499 4500 /* 4501 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4502 * couldn't have changed. 4503 */ 4504 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4505 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4506 4507 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4508 4509 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4510 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4511 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4512 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4513 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4514 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4515 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4516 4517 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4518 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4519 vmcs_write64(GUEST_BNDCFGS, 0); 4520 4521 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4522 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4523 vcpu->arch.pat = vmcs12->host_ia32_pat; 4524 } 4525 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4526 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4527 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4528 vmcs12->host_ia32_perf_global_ctrl)); 4529 4530 /* Set L1 segment info according to Intel SDM 4531 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4532 seg = (struct kvm_segment) { 4533 .base = 0, 4534 .limit = 0xFFFFFFFF, 4535 .selector = vmcs12->host_cs_selector, 4536 .type = 11, 4537 .present = 1, 4538 .s = 1, 4539 .g = 1 4540 }; 4541 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4542 seg.l = 1; 4543 else 4544 seg.db = 1; 4545 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4546 seg = (struct kvm_segment) { 4547 .base = 0, 4548 .limit = 0xFFFFFFFF, 4549 .type = 3, 4550 .present = 1, 4551 .s = 1, 4552 .db = 1, 4553 .g = 1 4554 }; 4555 seg.selector = vmcs12->host_ds_selector; 4556 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4557 seg.selector = vmcs12->host_es_selector; 4558 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4559 seg.selector = vmcs12->host_ss_selector; 4560 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4561 seg.selector = vmcs12->host_fs_selector; 4562 seg.base = vmcs12->host_fs_base; 4563 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4564 seg.selector = vmcs12->host_gs_selector; 4565 seg.base = vmcs12->host_gs_base; 4566 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4567 seg = (struct kvm_segment) { 4568 .base = vmcs12->host_tr_base, 4569 .limit = 0x67, 4570 .selector = vmcs12->host_tr_selector, 4571 .type = 11, 4572 .present = 1 4573 }; 4574 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4575 4576 memset(&seg, 0, sizeof(seg)); 4577 seg.unusable = 1; 4578 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4579 4580 kvm_set_dr(vcpu, 7, 0x400); 4581 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4582 4583 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4584 vmcs12->vm_exit_msr_load_count)) 4585 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4586 4587 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4588 } 4589 4590 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4591 { 4592 struct vmx_uret_msr *efer_msr; 4593 unsigned int i; 4594 4595 if (vm_entry_controls_get(vmx) & 
VM_ENTRY_LOAD_IA32_EFER) 4596 return vmcs_read64(GUEST_IA32_EFER); 4597 4598 if (cpu_has_load_ia32_efer()) 4599 return host_efer; 4600 4601 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4602 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4603 return vmx->msr_autoload.guest.val[i].value; 4604 } 4605 4606 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4607 if (efer_msr) 4608 return efer_msr->data; 4609 4610 return host_efer; 4611 } 4612 4613 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4614 { 4615 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4616 struct vcpu_vmx *vmx = to_vmx(vcpu); 4617 struct vmx_msr_entry g, h; 4618 gpa_t gpa; 4619 u32 i, j; 4620 4621 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4622 4623 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4624 /* 4625 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4626 * as vmcs01.GUEST_DR7 contains a userspace defined value 4627 * and vcpu->arch.dr7 is not squirreled away before the 4628 * nested VMENTER (not worth adding a variable in nested_vmx). 4629 */ 4630 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4631 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4632 else 4633 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4634 } 4635 4636 /* 4637 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4638 * handle a variety of side effects to KVM's software model. 4639 */ 4640 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4641 4642 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4643 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4644 4645 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4646 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4647 4648 nested_ept_uninit_mmu_context(vcpu); 4649 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4650 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4651 4652 /* 4653 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4654 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4655 * VMFail, like everything else we just need to ensure our 4656 * software model is up-to-date. 4657 */ 4658 if (enable_ept && is_pae_paging(vcpu)) 4659 ept_save_pdptrs(vcpu); 4660 4661 kvm_mmu_reset_context(vcpu); 4662 4663 /* 4664 * This nasty bit of open coding is a compromise between blindly 4665 * loading L1's MSRs using the exit load lists (incorrect emulation 4666 * of VMFail), leaving the nested VM's MSRs in the software model 4667 * (incorrect behavior) and snapshotting the modified MSRs (too 4668 * expensive since the lists are unbound by hardware). For each 4669 * MSR that was (prematurely) loaded from the nested VMEntry load 4670 * list, reload it from the exit load list if it exists and differs 4671 * from the guest value. The intent is to stuff host state as 4672 * silently as possible, not to fully process the exit load list. 
4673 */ 4674 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4675 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4676 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4677 pr_debug_ratelimited( 4678 "%s read MSR index failed (%u, 0x%08llx)\n", 4679 __func__, i, gpa); 4680 goto vmabort; 4681 } 4682 4683 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4684 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4685 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4686 pr_debug_ratelimited( 4687 "%s read MSR failed (%u, 0x%08llx)\n", 4688 __func__, j, gpa); 4689 goto vmabort; 4690 } 4691 if (h.index != g.index) 4692 continue; 4693 if (h.value == g.value) 4694 break; 4695 4696 if (nested_vmx_load_msr_check(vcpu, &h)) { 4697 pr_debug_ratelimited( 4698 "%s check failed (%u, 0x%x, 0x%x)\n", 4699 __func__, j, h.index, h.reserved); 4700 goto vmabort; 4701 } 4702 4703 if (kvm_set_msr(vcpu, h.index, h.value)) { 4704 pr_debug_ratelimited( 4705 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4706 __func__, j, h.index, h.value); 4707 goto vmabort; 4708 } 4709 } 4710 } 4711 4712 return; 4713 4714 vmabort: 4715 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4716 } 4717 4718 /* 4719 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4720 * and modify vmcs12 to make it see what it would expect to see there if 4721 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4722 */ 4723 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4724 u32 exit_intr_info, unsigned long exit_qualification) 4725 { 4726 struct vcpu_vmx *vmx = to_vmx(vcpu); 4727 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4728 4729 /* Pending MTF traps are discarded on VM-Exit. */ 4730 vmx->nested.mtf_pending = false; 4731 4732 /* trying to cancel vmlaunch/vmresume is a bug */ 4733 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4734 4735 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4736 /* 4737 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4738 * Enlightened VMCS after migration and we still need to 4739 * do that when something is forcing L2->L1 exit prior to 4740 * the first L2 run. 4741 */ 4742 (void)nested_get_evmcs_page(vcpu); 4743 } 4744 4745 /* Service pending TLB flush requests for L2 before switching to L1. */ 4746 kvm_service_local_tlb_flush_requests(vcpu); 4747 4748 /* 4749 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4750 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4751 * up-to-date before switching to L1. 4752 */ 4753 if (enable_ept && is_pae_paging(vcpu)) 4754 vmx_ept_load_pdptrs(vcpu); 4755 4756 leave_guest_mode(vcpu); 4757 4758 if (nested_cpu_has_preemption_timer(vmcs12)) 4759 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4760 4761 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4762 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4763 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4764 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4765 } 4766 4767 if (likely(!vmx->fail)) { 4768 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4769 4770 if (vm_exit_reason != -1) 4771 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4772 exit_intr_info, exit_qualification); 4773 4774 /* 4775 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4776 * also be used to capture vmcs12 cache as part of 4777 * capturing nVMX state for snapshot (migration). 
4778 * 4779 * Otherwise, this flush will dirty guest memory at a 4780 * point it is already assumed by user-space to be 4781 * immutable. 4782 */ 4783 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4784 } else { 4785 /* 4786 * The only expected VM-instruction error is "VM entry with 4787 * invalid control field(s)." Anything else indicates a 4788 * problem with L0. And we should never get here with a 4789 * VMFail of any type if early consistency checks are enabled. 4790 */ 4791 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4792 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4793 WARN_ON_ONCE(nested_early_check); 4794 } 4795 4796 /* 4797 * Drop events/exceptions that were queued for re-injection to L2 4798 * (picked up via vmx_complete_interrupts()), as well as exceptions 4799 * that were pending for L2. Note, this must NOT be hoisted above 4800 * prepare_vmcs12(), events/exceptions queued for re-injection need to 4801 * be captured in vmcs12 (see vmcs12_save_pending_event()). 4802 */ 4803 vcpu->arch.nmi_injected = false; 4804 kvm_clear_exception_queue(vcpu); 4805 kvm_clear_interrupt_queue(vcpu); 4806 4807 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4808 4809 /* 4810 * If IBRS is advertised to the vCPU, KVM must flush the indirect 4811 * branch predictors when transitioning from L2 to L1, as L1 expects 4812 * hardware (KVM in this case) to provide separate predictor modes. 4813 * Bare metal isolates VMX root (host) from VMX non-root (guest), but 4814 * doesn't isolate different VMCSs, i.e. in this case, doesn't provide 4815 * separate modes for L2 vs L1. 4816 */ 4817 if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) 4818 indirect_branch_prediction_barrier(); 4819 4820 /* Update any VMCS fields that might have changed while L2 ran */ 4821 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4822 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4823 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4824 if (kvm_caps.has_tsc_control) 4825 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 4826 4827 if (vmx->nested.l1_tpr_threshold != -1) 4828 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4829 4830 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4831 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4832 vmx_set_virtual_apic_mode(vcpu); 4833 } 4834 4835 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4836 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4837 vmx_update_cpu_dirty_logging(vcpu); 4838 } 4839 4840 /* Unpin physical memory we referred to in vmcs02 */ 4841 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 4842 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4843 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4844 vmx->nested.pi_desc = NULL; 4845 4846 if (vmx->nested.reload_vmcs01_apic_access_page) { 4847 vmx->nested.reload_vmcs01_apic_access_page = false; 4848 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4849 } 4850 4851 if (vmx->nested.update_vmcs01_apicv_status) { 4852 vmx->nested.update_vmcs01_apicv_status = false; 4853 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 4854 } 4855 4856 if ((vm_exit_reason != -1) && 4857 (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) 4858 vmx->nested.need_vmcs12_to_shadow_sync = true; 4859 4860 /* in case we halted in L2 */ 4861 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4862 4863 if (likely(!vmx->fail)) { 4864 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4865 nested_exit_intr_ack_set(vcpu)) { 4866 int irq = 
kvm_cpu_get_interrupt(vcpu); 4867 WARN_ON(irq < 0); 4868 vmcs12->vm_exit_intr_info = irq | 4869 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4870 } 4871 4872 if (vm_exit_reason != -1) 4873 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4874 vmcs12->exit_qualification, 4875 vmcs12->idt_vectoring_info_field, 4876 vmcs12->vm_exit_intr_info, 4877 vmcs12->vm_exit_intr_error_code, 4878 KVM_ISA_VMX); 4879 4880 load_vmcs12_host_state(vcpu, vmcs12); 4881 4882 return; 4883 } 4884 4885 /* 4886 * After an early L2 VM-entry failure, we're now back 4887 * in L1 which thinks it just finished a VMLAUNCH or 4888 * VMRESUME instruction, so we need to set the failure 4889 * flag and the VM-instruction error field of the VMCS 4890 * accordingly, and skip the emulated instruction. 4891 */ 4892 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4893 4894 /* 4895 * Restore L1's host state to KVM's software model. We're here 4896 * because a consistency check was caught by hardware, which 4897 * means some amount of guest state has been propagated to KVM's 4898 * model and needs to be unwound to the host's state. 4899 */ 4900 nested_vmx_restore_host_state(vcpu); 4901 4902 vmx->fail = 0; 4903 } 4904 4905 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4906 { 4907 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4908 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4909 } 4910 4911 /* 4912 * Decode the memory-address operand of a vmx instruction, as recorded on an 4913 * exit caused by such an instruction (run by a guest hypervisor). 4914 * On success, returns 0. When the operand is invalid, returns 1 and throws 4915 * #UD, #GP, or #SS. 4916 */ 4917 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4918 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4919 { 4920 gva_t off; 4921 bool exn; 4922 struct kvm_segment s; 4923 4924 /* 4925 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4926 * Execution", on an exit, vmx_instruction_info holds most of the 4927 * addressing components of the operand. Only the displacement part 4928 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4929 * For how an actual address is calculated from all these components, 4930 * refer to Vol. 1, "Operand Addressing". 4931 */ 4932 int scaling = vmx_instruction_info & 3; 4933 int addr_size = (vmx_instruction_info >> 7) & 7; 4934 bool is_reg = vmx_instruction_info & (1u << 10); 4935 int seg_reg = (vmx_instruction_info >> 15) & 7; 4936 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4937 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4938 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4939 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4940 4941 if (is_reg) { 4942 kvm_queue_exception(vcpu, UD_VECTOR); 4943 return 1; 4944 } 4945 4946 /* Addr = segment_base + offset */ 4947 /* offset = base + [index * scale] + displacement */ 4948 off = exit_qualification; /* holds the displacement */ 4949 if (addr_size == 1) 4950 off = (gva_t)sign_extend64(off, 31); 4951 else if (addr_size == 0) 4952 off = (gva_t)sign_extend64(off, 15); 4953 if (base_is_valid) 4954 off += kvm_register_read(vcpu, base_reg); 4955 if (index_is_valid) 4956 off += kvm_register_read(vcpu, index_reg) << scaling; 4957 vmx_get_segment(vcpu, &s, seg_reg); 4958 4959 /* 4960 * The effective address, i.e. @off, of a memory operand is truncated 4961 * based on the address size of the instruction. 
Note that this is 4962 * the *effective address*, i.e. the address prior to accounting for 4963 * the segment's base. 4964 */ 4965 if (addr_size == 1) /* 32 bit */ 4966 off &= 0xffffffff; 4967 else if (addr_size == 0) /* 16 bit */ 4968 off &= 0xffff; 4969 4970 /* Checks for #GP/#SS exceptions. */ 4971 exn = false; 4972 if (is_long_mode(vcpu)) { 4973 /* 4974 * The virtual/linear address is never truncated in 64-bit 4975 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4976 * address when using FS/GS with a non-zero base. 4977 */ 4978 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4979 *ret = s.base + off; 4980 else 4981 *ret = off; 4982 4983 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4984 * non-canonical form. This is the only check on the memory 4985 * destination for long mode! 4986 */ 4987 exn = is_noncanonical_address(*ret, vcpu); 4988 } else { 4989 /* 4990 * When not in long mode, the virtual/linear address is 4991 * unconditionally truncated to 32 bits regardless of the 4992 * address size. 4993 */ 4994 *ret = (s.base + off) & 0xffffffff; 4995 4996 /* Protected mode: apply checks for segment validity in the 4997 * following order: 4998 * - segment type check (#GP(0) may be thrown) 4999 * - usability check (#GP(0)/#SS(0)) 5000 * - limit check (#GP(0)/#SS(0)) 5001 */ 5002 if (wr) 5003 /* #GP(0) if the destination operand is located in a 5004 * read-only data segment or any code segment. 5005 */ 5006 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5007 else 5008 /* #GP(0) if the source operand is located in an 5009 * execute-only code segment 5010 */ 5011 exn = ((s.type & 0xa) == 8); 5012 if (exn) { 5013 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5014 return 1; 5015 } 5016 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5017 */ 5018 exn = (s.unusable != 0); 5019 5020 /* 5021 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5022 * outside the segment limit. All CPUs that support VMX ignore 5023 * limit checks for flat segments, i.e. segments with base==0, 5024 * limit==0xffffffff and of type expand-up data or code. 5025 */ 5026 if (!(s.base == 0 && s.limit == 0xffffffff && 5027 ((s.type & 8) || !(s.type & 4)))) 5028 exn = exn || ((u64)off + len - 1 > s.limit); 5029 } 5030 if (exn) { 5031 kvm_queue_exception_e(vcpu, 5032 seg_reg == VCPU_SREG_SS ? 5033 SS_VECTOR : GP_VECTOR, 5034 0); 5035 return 1; 5036 } 5037 5038 return 0; 5039 } 5040 5041 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5042 int *ret) 5043 { 5044 gva_t gva; 5045 struct x86_exception e; 5046 int r; 5047 5048 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5049 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5050 sizeof(*vmpointer), &gva)) { 5051 *ret = 1; 5052 return -EINVAL; 5053 } 5054 5055 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5056 if (r != X86EMUL_CONTINUE) { 5057 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5058 return -EINVAL; 5059 } 5060 5061 return 0; 5062 } 5063 5064 /* 5065 * Allocate a shadow VMCS and associate it with the currently loaded 5066 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5067 * VMCS is also VMCLEARed, so that it is ready for use. 
5068 */ 5069 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5070 { 5071 struct vcpu_vmx *vmx = to_vmx(vcpu); 5072 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5073 5074 /* 5075 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5076 * when L1 executes VMXOFF or the vCPU is forced out of nested 5077 * operation. VMXON faults if the CPU is already post-VMXON, so it 5078 * should be impossible to already have an allocated shadow VMCS. KVM 5079 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5080 * always be the loaded VMCS. 5081 */ 5082 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5083 return loaded_vmcs->shadow_vmcs; 5084 5085 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5086 if (loaded_vmcs->shadow_vmcs) 5087 vmcs_clear(loaded_vmcs->shadow_vmcs); 5088 5089 return loaded_vmcs->shadow_vmcs; 5090 } 5091 5092 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5093 { 5094 struct vcpu_vmx *vmx = to_vmx(vcpu); 5095 int r; 5096 5097 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5098 if (r < 0) 5099 goto out_vmcs02; 5100 5101 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5102 if (!vmx->nested.cached_vmcs12) 5103 goto out_cached_vmcs12; 5104 5105 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5106 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5107 if (!vmx->nested.cached_shadow_vmcs12) 5108 goto out_cached_shadow_vmcs12; 5109 5110 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5111 goto out_shadow_vmcs; 5112 5113 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 5114 HRTIMER_MODE_ABS_PINNED); 5115 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 5116 5117 vmx->nested.vpid02 = allocate_vpid(); 5118 5119 vmx->nested.vmcs02_initialized = false; 5120 vmx->nested.vmxon = true; 5121 5122 if (vmx_pt_mode_is_host_guest()) { 5123 vmx->pt_desc.guest.ctl = 0; 5124 pt_update_intercept_for_msr(vcpu); 5125 } 5126 5127 return 0; 5128 5129 out_shadow_vmcs: 5130 kfree(vmx->nested.cached_shadow_vmcs12); 5131 5132 out_cached_shadow_vmcs12: 5133 kfree(vmx->nested.cached_vmcs12); 5134 5135 out_cached_vmcs12: 5136 free_loaded_vmcs(&vmx->nested.vmcs02); 5137 5138 out_vmcs02: 5139 return -ENOMEM; 5140 } 5141 5142 /* Emulate the VMXON instruction. */ 5143 static int handle_vmxon(struct kvm_vcpu *vcpu) 5144 { 5145 int ret; 5146 gpa_t vmptr; 5147 uint32_t revision; 5148 struct vcpu_vmx *vmx = to_vmx(vcpu); 5149 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5150 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5151 5152 /* 5153 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5154 * the guest and so cannot rely on hardware to perform the check, 5155 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5156 * for VMXON). 5157 * 5158 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5159 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5160 * force any of the relevant guest state. For a restricted guest, KVM 5161 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5162 * Real Mode, and so there's no need to check CR0.PE manually. 5163 */ 5164 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5165 kvm_queue_exception(vcpu, UD_VECTOR); 5166 return 1; 5167 } 5168 5169 /* 5170 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5171 * and has higher priority than the VM-Fail due to being post-VMXON, 5172 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. 
In VMX non-root, 5173 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5174 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5175 * VMX non-root. 5176 * 5177 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5178 * #UD checks (see above), is functionally ok because KVM doesn't allow 5179 * L1 to run L2 without CR4.VMXE=1, and because KVM never modifies L2's 5180 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5181 * missed by hardware due to shadowing CR0 and/or CR4. 5182 */ 5183 if (vmx_get_cpl(vcpu)) { 5184 kvm_inject_gp(vcpu, 0); 5185 return 1; 5186 } 5187 5188 if (vmx->nested.vmxon) 5189 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5190 5191 /* 5192 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5193 * only if the vCPU isn't already in VMX operation, i.e. effectively 5194 * have lower priority than the VM-Fail above. 5195 */ 5196 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5197 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5198 kvm_inject_gp(vcpu, 0); 5199 return 1; 5200 } 5201 5202 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5203 != VMXON_NEEDED_FEATURES) { 5204 kvm_inject_gp(vcpu, 0); 5205 return 1; 5206 } 5207 5208 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5209 return ret; 5210 5211 /* 5212 * SDM 3: 24.11.5 5213 * The first 4 bytes of VMXON region contain the supported 5214 * VMCS revision identifier 5215 * 5216 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case, 5217 * which would limit the VMXON/VMCS physical address width to 32 bits 5218 */ 5219 if (!page_address_valid(vcpu, vmptr)) 5220 return nested_vmx_failInvalid(vcpu); 5221 5222 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5223 revision != VMCS12_REVISION) 5224 return nested_vmx_failInvalid(vcpu); 5225 5226 vmx->nested.vmxon_ptr = vmptr; 5227 ret = enter_vmx_operation(vcpu); 5228 if (ret) 5229 return ret; 5230 5231 return nested_vmx_succeed(vcpu); 5232 } 5233 5234 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5235 { 5236 struct vcpu_vmx *vmx = to_vmx(vcpu); 5237 5238 if (vmx->nested.current_vmptr == INVALID_GPA) 5239 return; 5240 5241 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5242 5243 if (enable_shadow_vmcs) { 5244 /* copy to memory all shadowed fields in case 5245 they were modified */ 5246 copy_shadow_to_vmcs12(vmx); 5247 vmx_disable_shadow_vmcs(vmx); 5248 } 5249 vmx->nested.posted_intr_nv = -1; 5250 5251 /* Flush VMCS12 to guest memory */ 5252 kvm_vcpu_write_guest_page(vcpu, 5253 vmx->nested.current_vmptr >> PAGE_SHIFT, 5254 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5255 5256 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5257 5258 vmx->nested.current_vmptr = INVALID_GPA; 5259 } 5260 5261 /* Emulate the VMXOFF instruction */ 5262 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5263 { 5264 if (!nested_vmx_check_permission(vcpu)) 5265 return 1; 5266 5267 free_nested(vcpu); 5268 5269 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5270 kvm_make_request(KVM_REQ_EVENT, vcpu); 5271 5272 return nested_vmx_succeed(vcpu); 5273 } 5274 5275 /* Emulate the VMCLEAR instruction */ 5276 static int handle_vmclear(struct kvm_vcpu *vcpu) 5277 { 5278 struct vcpu_vmx *vmx = to_vmx(vcpu); 5279 u32 zero = 0; 5280 gpa_t vmptr; 5281 int r; 5282 5283 if (!nested_vmx_check_permission(vcpu)) 5284 return 1; 5285 5286 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5287 return r; 5288 5289 if (!page_address_valid(vcpu, vmptr))
5290 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5291 5292 if (vmptr == vmx->nested.vmxon_ptr) 5293 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5294 5295 /* 5296 * When Enlightened VMEntry is enabled on the calling CPU we treat the 5297 * memory area pointed to by vmptr as an Enlightened VMCS (as there's no good 5298 * way to distinguish it from VMCS12) and we must not corrupt it by 5299 * writing to the non-existent 'launch_state' field. The area doesn't 5300 * have to be the currently active EVMCS on the calling CPU and there's 5301 * nothing KVM has to do to transition it from 'active' to 'non-active' 5302 * state. It is possible that the area will stay mapped as 5303 * vmx->nested.hv_evmcs but this shouldn't be a problem. 5304 */ 5305 if (likely(!guest_cpuid_has_evmcs(vcpu) || 5306 !evmptr_is_valid(nested_get_evmptr(vcpu)))) { 5307 if (vmptr == vmx->nested.current_vmptr) 5308 nested_release_vmcs12(vcpu); 5309 5310 /* 5311 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5312 * for VMCLEAR includes an "ensure that data for VMCS referenced 5313 * by the operand is in memory" clause that guards writes to 5314 * memory, i.e. doing nothing for I/O is architecturally valid. 5315 * 5316 * FIXME: Suppress failures if and only if no memslot is found, 5317 * i.e. exit to userspace if __copy_to_user() fails. 5318 */ 5319 (void)kvm_vcpu_write_guest(vcpu, 5320 vmptr + offsetof(struct vmcs12, 5321 launch_state), 5322 &zero, sizeof(zero)); 5323 } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { 5324 nested_release_evmcs(vcpu); 5325 } 5326 5327 return nested_vmx_succeed(vcpu); 5328 } 5329 5330 /* Emulate the VMLAUNCH instruction */ 5331 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5332 { 5333 return nested_vmx_run(vcpu, true); 5334 } 5335 5336 /* Emulate the VMRESUME instruction */ 5337 static int handle_vmresume(struct kvm_vcpu *vcpu) 5338 { 5339 5340 return nested_vmx_run(vcpu, false); 5341 } 5342 5343 static int handle_vmread(struct kvm_vcpu *vcpu) 5344 { 5345 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5346 : get_vmcs12(vcpu); 5347 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5348 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5349 struct vcpu_vmx *vmx = to_vmx(vcpu); 5350 struct x86_exception e; 5351 unsigned long field; 5352 u64 value; 5353 gva_t gva = 0; 5354 short offset; 5355 int len, r; 5356 5357 if (!nested_vmx_check_permission(vcpu)) 5358 return 1; 5359 5360 /* Decode instruction info and find the field to read */ 5361 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5362 5363 if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 5364 /* 5365 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5366 * any VMREAD sets the ALU flags for VMfailInvalid.
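* The check below covers that case as well as the simpler one where L1 itself has no current VMCS (current_vmptr == INVALID_GPA).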
5367 */ 5368 if (vmx->nested.current_vmptr == INVALID_GPA || 5369 (is_guest_mode(vcpu) && 5370 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5371 return nested_vmx_failInvalid(vcpu); 5372 5373 offset = get_vmcs12_field_offset(field); 5374 if (offset < 0) 5375 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5376 5377 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5378 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5379 5380 /* Read the field, zero-extended to a u64 value */ 5381 value = vmcs12_read_any(vmcs12, field, offset); 5382 } else { 5383 /* 5384 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5385 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5386 * unsupported. Unfortunately, certain versions of Windows 11 5387 * don't comply with this requirement which is not enforced in 5388 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5389 * workaround, as misbehaving guests will panic on VM-Fail. 5390 * Note, enlightened VMCS is incompatible with shadow VMCS so 5391 * all VMREADs from L2 should go to L1. 5392 */ 5393 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5394 return nested_vmx_failInvalid(vcpu); 5395 5396 offset = evmcs_field_offset(field, NULL); 5397 if (offset < 0) 5398 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5399 5400 /* Read the field, zero-extended to a u64 value */ 5401 value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset); 5402 } 5403 5404 /* 5405 * Now copy part of this value to register or memory, as requested. 5406 * Note that the number of bits actually copied is 32 or 64 depending 5407 * on the guest's mode (32 or 64 bit), not on the given field's length. 5408 */ 5409 if (instr_info & BIT(10)) { 5410 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5411 } else { 5412 len = is_64_bit_mode(vcpu) ? 8 : 4; 5413 if (get_vmx_mem_address(vcpu, exit_qualification, 5414 instr_info, true, len, &gva)) 5415 return 1; 5416 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5417 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5418 if (r != X86EMUL_CONTINUE) 5419 return kvm_handle_memory_failure(vcpu, r, &e); 5420 } 5421 5422 return nested_vmx_succeed(vcpu); 5423 } 5424 5425 static bool is_shadow_field_rw(unsigned long field) 5426 { 5427 switch (field) { 5428 #define SHADOW_FIELD_RW(x, y) case x: 5429 #include "vmcs_shadow_fields.h" 5430 return true; 5431 default: 5432 break; 5433 } 5434 return false; 5435 } 5436 5437 static bool is_shadow_field_ro(unsigned long field) 5438 { 5439 switch (field) { 5440 #define SHADOW_FIELD_RO(x, y) case x: 5441 #include "vmcs_shadow_fields.h" 5442 return true; 5443 default: 5444 break; 5445 } 5446 return false; 5447 } 5448 5449 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5450 { 5451 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5452 : get_vmcs12(vcpu); 5453 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5454 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5455 struct vcpu_vmx *vmx = to_vmx(vcpu); 5456 struct x86_exception e; 5457 unsigned long field; 5458 short offset; 5459 gva_t gva; 5460 int len, r; 5461 5462 /* 5463 * The value to write might be 32 or 64 bits, depending on L1's long 5464 * mode, and eventually we need to write that into a field of several 5465 * possible lengths. The code below first zero-extends the value to 64 5466 * bit (value), and then copies only the appropriate number of 5467 * bits into the vmcs12 field. 
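* For example, a VMWRITE to a 16-bit field such as GUEST_ES_SELECTOR ends up storing only the low 16 bits of the zero-extended value.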
5468 */ 5469 u64 value = 0; 5470 5471 if (!nested_vmx_check_permission(vcpu)) 5472 return 1; 5473 5474 /* 5475 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5476 * any VMWRITE sets the ALU flags for VMfailInvalid. 5477 */ 5478 if (vmx->nested.current_vmptr == INVALID_GPA || 5479 (is_guest_mode(vcpu) && 5480 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5481 return nested_vmx_failInvalid(vcpu); 5482 5483 if (instr_info & BIT(10)) 5484 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5485 else { 5486 len = is_64_bit_mode(vcpu) ? 8 : 4; 5487 if (get_vmx_mem_address(vcpu, exit_qualification, 5488 instr_info, false, len, &gva)) 5489 return 1; 5490 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5491 if (r != X86EMUL_CONTINUE) 5492 return kvm_handle_memory_failure(vcpu, r, &e); 5493 } 5494 5495 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5496 5497 offset = get_vmcs12_field_offset(field); 5498 if (offset < 0) 5499 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5500 5501 /* 5502 * If the vCPU supports "VMWRITE to any supported field in the 5503 * VMCS," then the "read-only" fields are actually read/write. 5504 */ 5505 if (vmcs_field_readonly(field) && 5506 !nested_cpu_has_vmwrite_any_field(vcpu)) 5507 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5508 5509 /* 5510 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5511 * vmcs12, else we may crush a field or consume a stale value. 5512 */ 5513 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5514 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5515 5516 /* 5517 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5518 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5519 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5520 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5521 * from L1 will return a different value than VMREAD from L2 (L1 sees 5522 * the stripped down value, L2 sees the full value as stored by KVM). 5523 */ 5524 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5525 value &= 0x1f0ff; 5526 5527 vmcs12_write_any(vmcs12, field, offset, value); 5528 5529 /* 5530 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5531 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5532 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5533 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5534 */ 5535 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5536 /* 5537 * L1 can read these fields without exiting, ensure the 5538 * shadow VMCS is up-to-date. 
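* The shadow VMCS is temporarily loaded as this CPU's current VMCS (with preemption disabled) so the field can be written directly, then VMCLEARed before the working VMCS is reloaded.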
5539 */ 5540 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5541 preempt_disable(); 5542 vmcs_load(vmx->vmcs01.shadow_vmcs); 5543 5544 __vmcs_writel(field, value); 5545 5546 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5547 vmcs_load(vmx->loaded_vmcs->vmcs); 5548 preempt_enable(); 5549 } 5550 vmx->nested.dirty_vmcs12 = true; 5551 } 5552 5553 return nested_vmx_succeed(vcpu); 5554 } 5555 5556 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5557 { 5558 vmx->nested.current_vmptr = vmptr; 5559 if (enable_shadow_vmcs) { 5560 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5561 vmcs_write64(VMCS_LINK_POINTER, 5562 __pa(vmx->vmcs01.shadow_vmcs)); 5563 vmx->nested.need_vmcs12_to_shadow_sync = true; 5564 } 5565 vmx->nested.dirty_vmcs12 = true; 5566 vmx->nested.force_msr_bitmap_recalc = true; 5567 } 5568 5569 /* Emulate the VMPTRLD instruction */ 5570 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5571 { 5572 struct vcpu_vmx *vmx = to_vmx(vcpu); 5573 gpa_t vmptr; 5574 int r; 5575 5576 if (!nested_vmx_check_permission(vcpu)) 5577 return 1; 5578 5579 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5580 return r; 5581 5582 if (!page_address_valid(vcpu, vmptr)) 5583 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5584 5585 if (vmptr == vmx->nested.vmxon_ptr) 5586 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5587 5588 /* Forbid normal VMPTRLD if Enlightened version was used */ 5589 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 5590 return 1; 5591 5592 if (vmx->nested.current_vmptr != vmptr) { 5593 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5594 struct vmcs_hdr hdr; 5595 5596 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5597 /* 5598 * Reads from an unbacked page return all 1s, 5599 * which means that the 32 bits located at the 5600 * given physical address won't match the required 5601 * VMCS12_REVISION identifier. 5602 */ 5603 return nested_vmx_fail(vcpu, 5604 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5605 } 5606 5607 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5608 offsetof(struct vmcs12, hdr), 5609 sizeof(hdr))) { 5610 return nested_vmx_fail(vcpu, 5611 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5612 } 5613 5614 if (hdr.revision_id != VMCS12_REVISION || 5615 (hdr.shadow_vmcs && 5616 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5617 return nested_vmx_fail(vcpu, 5618 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5619 } 5620 5621 nested_release_vmcs12(vcpu); 5622 5623 /* 5624 * Load VMCS12 from guest memory since it is not already 5625 * cached. 
5626 */ 5627 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5628 VMCS12_SIZE)) { 5629 return nested_vmx_fail(vcpu, 5630 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5631 } 5632 5633 set_current_vmptr(vmx, vmptr); 5634 } 5635 5636 return nested_vmx_succeed(vcpu); 5637 } 5638 5639 /* Emulate the VMPTRST instruction */ 5640 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5641 { 5642 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5643 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5644 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5645 struct x86_exception e; 5646 gva_t gva; 5647 int r; 5648 5649 if (!nested_vmx_check_permission(vcpu)) 5650 return 1; 5651 5652 if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) 5653 return 1; 5654 5655 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5656 true, sizeof(gpa_t), &gva)) 5657 return 1; 5658 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5659 r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, 5660 sizeof(gpa_t), &e); 5661 if (r != X86EMUL_CONTINUE) 5662 return kvm_handle_memory_failure(vcpu, r, &e); 5663 5664 return nested_vmx_succeed(vcpu); 5665 } 5666 5667 /* Emulate the INVEPT instruction */ 5668 static int handle_invept(struct kvm_vcpu *vcpu) 5669 { 5670 struct vcpu_vmx *vmx = to_vmx(vcpu); 5671 u32 vmx_instruction_info, types; 5672 unsigned long type, roots_to_free; 5673 struct kvm_mmu *mmu; 5674 gva_t gva; 5675 struct x86_exception e; 5676 struct { 5677 u64 eptp, gpa; 5678 } operand; 5679 int i, r, gpr_index; 5680 5681 if (!(vmx->nested.msrs.secondary_ctls_high & 5682 SECONDARY_EXEC_ENABLE_EPT) || 5683 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5684 kvm_queue_exception(vcpu, UD_VECTOR); 5685 return 1; 5686 } 5687 5688 if (!nested_vmx_check_permission(vcpu)) 5689 return 1; 5690 5691 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5692 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5693 type = kvm_register_read(vcpu, gpr_index); 5694 5695 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5696 5697 if (type >= 32 || !(types & (1 << type))) 5698 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5699 5700 /* According to the Intel VMX instruction reference, the memory 5701 * operand is read even if it isn't needed (e.g., for type==global) 5702 */ 5703 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5704 vmx_instruction_info, false, sizeof(operand), &gva)) 5705 return 1; 5706 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5707 if (r != X86EMUL_CONTINUE) 5708 return kvm_handle_memory_failure(vcpu, r, &e); 5709 5710 /* 5711 * Nested EPT roots are always held through guest_mmu, 5712 * not root_mmu.
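* Thus INVEPT only needs to consider guest_mmu's current and previous roots when deciding which roots to free.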
5713 */ 5714 mmu = &vcpu->arch.guest_mmu; 5715 5716 switch (type) { 5717 case VMX_EPT_EXTENT_CONTEXT: 5718 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5719 return nested_vmx_fail(vcpu, 5720 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5721 5722 roots_to_free = 0; 5723 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 5724 operand.eptp)) 5725 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5726 5727 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5728 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5729 mmu->prev_roots[i].pgd, 5730 operand.eptp)) 5731 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5732 } 5733 break; 5734 case VMX_EPT_EXTENT_GLOBAL: 5735 roots_to_free = KVM_MMU_ROOTS_ALL; 5736 break; 5737 default: 5738 BUG(); 5739 break; 5740 } 5741 5742 if (roots_to_free) 5743 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 5744 5745 return nested_vmx_succeed(vcpu); 5746 } 5747 5748 static int handle_invvpid(struct kvm_vcpu *vcpu) 5749 { 5750 struct vcpu_vmx *vmx = to_vmx(vcpu); 5751 u32 vmx_instruction_info; 5752 unsigned long type, types; 5753 gva_t gva; 5754 struct x86_exception e; 5755 struct { 5756 u64 vpid; 5757 u64 gla; 5758 } operand; 5759 u16 vpid02; 5760 int r, gpr_index; 5761 5762 if (!(vmx->nested.msrs.secondary_ctls_high & 5763 SECONDARY_EXEC_ENABLE_VPID) || 5764 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5765 kvm_queue_exception(vcpu, UD_VECTOR); 5766 return 1; 5767 } 5768 5769 if (!nested_vmx_check_permission(vcpu)) 5770 return 1; 5771 5772 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5773 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5774 type = kvm_register_read(vcpu, gpr_index); 5775 5776 types = (vmx->nested.msrs.vpid_caps & 5777 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5778 5779 if (type >= 32 || !(types & (1 << type))) 5780 return nested_vmx_fail(vcpu, 5781 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5782 5783 /* according to the intel vmx instruction reference, the memory 5784 * operand is read even if it isn't needed (e.g., for type==global) 5785 */ 5786 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5787 vmx_instruction_info, false, sizeof(operand), &gva)) 5788 return 1; 5789 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5790 if (r != X86EMUL_CONTINUE) 5791 return kvm_handle_memory_failure(vcpu, r, &e); 5792 5793 if (operand.vpid >> 16) 5794 return nested_vmx_fail(vcpu, 5795 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5796 5797 vpid02 = nested_get_vpid02(vcpu); 5798 switch (type) { 5799 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5800 if (!operand.vpid || 5801 is_noncanonical_address(operand.gla, vcpu)) 5802 return nested_vmx_fail(vcpu, 5803 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5804 vpid_sync_vcpu_addr(vpid02, operand.gla); 5805 break; 5806 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5807 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5808 if (!operand.vpid) 5809 return nested_vmx_fail(vcpu, 5810 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5811 vpid_sync_context(vpid02); 5812 break; 5813 case VMX_VPID_EXTENT_ALL_CONTEXT: 5814 vpid_sync_context(vpid02); 5815 break; 5816 default: 5817 WARN_ON_ONCE(1); 5818 return kvm_skip_emulated_instruction(vcpu); 5819 } 5820 5821 /* 5822 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5823 * linear mappings for L2 (tagged with L2's VPID). Free all guest 5824 * roots as VPIDs are not tracked in the MMU role. 5825 * 5826 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5827 * an MMU when EPT is disabled. 
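* kvm_mmu_free_guest_mode_roots() only zaps roots that were created for guest (L2) mode, so L1's own roots are left untouched.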
5828 * 5829 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR. 5830 */ 5831 if (!enable_ept) 5832 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 5833 5834 return nested_vmx_succeed(vcpu); 5835 } 5836 5837 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5838 struct vmcs12 *vmcs12) 5839 { 5840 u32 index = kvm_rcx_read(vcpu); 5841 u64 new_eptp; 5842 5843 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 5844 return 1; 5845 if (index >= VMFUNC_EPTP_ENTRIES) 5846 return 1; 5847 5848 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5849 &new_eptp, index * 8, 8)) 5850 return 1; 5851 5852 /* 5853 * If the (L2) guest does a vmfunc to the currently 5854 * active ept pointer, we don't have to do anything else 5855 */ 5856 if (vmcs12->ept_pointer != new_eptp) { 5857 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5858 return 1; 5859 5860 vmcs12->ept_pointer = new_eptp; 5861 nested_ept_new_eptp(vcpu); 5862 5863 if (!nested_cpu_has_vpid(vmcs12)) 5864 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 5865 } 5866 5867 return 0; 5868 } 5869 5870 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5871 { 5872 struct vcpu_vmx *vmx = to_vmx(vcpu); 5873 struct vmcs12 *vmcs12; 5874 u32 function = kvm_rax_read(vcpu); 5875 5876 /* 5877 * VMFUNC should never execute cleanly while L1 is active; KVM supports 5878 * VMFUNC for nested VMs, but not for L1. 5879 */ 5880 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 5881 kvm_queue_exception(vcpu, UD_VECTOR); 5882 return 1; 5883 } 5884 5885 vmcs12 = get_vmcs12(vcpu); 5886 5887 /* 5888 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 5889 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 5890 */ 5891 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 5892 kvm_queue_exception(vcpu, UD_VECTOR); 5893 return 1; 5894 } 5895 5896 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 5897 goto fail; 5898 5899 switch (function) { 5900 case 0: 5901 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5902 goto fail; 5903 break; 5904 default: 5905 goto fail; 5906 } 5907 return kvm_skip_emulated_instruction(vcpu); 5908 5909 fail: 5910 /* 5911 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5912 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5913 * EXIT_REASON_VMFUNC as the exit reason. 5914 */ 5915 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5916 vmx_get_intr_info(vcpu), 5917 vmx_get_exit_qual(vcpu)); 5918 return 1; 5919 } 5920 5921 /* 5922 * Return true if an IO instruction with the specified port and size should cause 5923 * a VM-exit into L1.
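* For example, a two-byte access to port 0x3f8 consults io_bitmap_a at byte (0x3f8 & 0x7fff) / 8 = 0x7f, bits 0 and 1; if either bit is set, the access must exit to L1.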
5924 */ 5925 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5926 int size) 5927 { 5928 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5929 gpa_t bitmap, last_bitmap; 5930 u8 b; 5931 5932 last_bitmap = INVALID_GPA; 5933 b = -1; 5934 5935 while (size > 0) { 5936 if (port < 0x8000) 5937 bitmap = vmcs12->io_bitmap_a; 5938 else if (port < 0x10000) 5939 bitmap = vmcs12->io_bitmap_b; 5940 else 5941 return true; 5942 bitmap += (port & 0x7fff) / 8; 5943 5944 if (last_bitmap != bitmap) 5945 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5946 return true; 5947 if (b & (1 << (port & 7))) 5948 return true; 5949 5950 port++; 5951 size--; 5952 last_bitmap = bitmap; 5953 } 5954 5955 return false; 5956 } 5957 5958 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5959 struct vmcs12 *vmcs12) 5960 { 5961 unsigned long exit_qualification; 5962 unsigned short port; 5963 int size; 5964 5965 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5966 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 5967 5968 exit_qualification = vmx_get_exit_qual(vcpu); 5969 5970 port = exit_qualification >> 16; 5971 size = (exit_qualification & 7) + 1; 5972 5973 return nested_vmx_check_io_bitmaps(vcpu, port, size); 5974 } 5975 5976 /* 5977 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 5978 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 5979 * disinterest in the current event (read or write a specific MSR) by using an 5980 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 5981 */ 5982 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 5983 struct vmcs12 *vmcs12, 5984 union vmx_exit_reason exit_reason) 5985 { 5986 u32 msr_index = kvm_rcx_read(vcpu); 5987 gpa_t bitmap; 5988 5989 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 5990 return true; 5991 5992 /* 5993 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 5994 * for the four combinations of read/write and low/high MSR numbers. 5995 * First we need to figure out which of the four to use: 5996 */ 5997 bitmap = vmcs12->msr_bitmap; 5998 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 5999 bitmap += 2048; 6000 if (msr_index >= 0xc0000000) { 6001 msr_index -= 0xc0000000; 6002 bitmap += 1024; 6003 } 6004 6005 /* Then read the msr_index'th bit from this bitmap: */ 6006 if (msr_index < 1024*8) { 6007 unsigned char b; 6008 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6009 return true; 6010 return 1 & (b >> (msr_index & 7)); 6011 } else 6012 return true; /* let L1 handle the wrong parameter */ 6013 } 6014 6015 /* 6016 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6017 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6018 * intercept (via guest_host_mask etc.) the current event. 
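* For example, if L1 owns CR0.TS (the bit is set in cr0_guest_host_mask) and L2 writes a CR0 value whose TS bit differs from cr0_read_shadow, the access is reflected to L1.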
6019 */ 6020 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6021 struct vmcs12 *vmcs12) 6022 { 6023 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6024 int cr = exit_qualification & 15; 6025 int reg; 6026 unsigned long val; 6027 6028 switch ((exit_qualification >> 4) & 3) { 6029 case 0: /* mov to cr */ 6030 reg = (exit_qualification >> 8) & 15; 6031 val = kvm_register_read(vcpu, reg); 6032 switch (cr) { 6033 case 0: 6034 if (vmcs12->cr0_guest_host_mask & 6035 (val ^ vmcs12->cr0_read_shadow)) 6036 return true; 6037 break; 6038 case 3: 6039 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6040 return true; 6041 break; 6042 case 4: 6043 if (vmcs12->cr4_guest_host_mask & 6044 (vmcs12->cr4_read_shadow ^ val)) 6045 return true; 6046 break; 6047 case 8: 6048 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6049 return true; 6050 break; 6051 } 6052 break; 6053 case 2: /* clts */ 6054 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6055 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6056 return true; 6057 break; 6058 case 1: /* mov from cr */ 6059 switch (cr) { 6060 case 3: 6061 if (vmcs12->cpu_based_vm_exec_control & 6062 CPU_BASED_CR3_STORE_EXITING) 6063 return true; 6064 break; 6065 case 8: 6066 if (vmcs12->cpu_based_vm_exec_control & 6067 CPU_BASED_CR8_STORE_EXITING) 6068 return true; 6069 break; 6070 } 6071 break; 6072 case 3: /* lmsw */ 6073 /* 6074 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6075 * cr0. Other attempted changes are ignored, with no exit. 6076 */ 6077 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6078 if (vmcs12->cr0_guest_host_mask & 0xe & 6079 (val ^ vmcs12->cr0_read_shadow)) 6080 return true; 6081 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6082 !(vmcs12->cr0_read_shadow & 0x1) && 6083 (val & 0x1)) 6084 return true; 6085 break; 6086 } 6087 return false; 6088 } 6089 6090 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6091 struct vmcs12 *vmcs12) 6092 { 6093 u32 encls_leaf; 6094 6095 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 6096 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6097 return false; 6098 6099 encls_leaf = kvm_rax_read(vcpu); 6100 if (encls_leaf > 62) 6101 encls_leaf = 63; 6102 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6103 } 6104 6105 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6106 struct vmcs12 *vmcs12, gpa_t bitmap) 6107 { 6108 u32 vmx_instruction_info; 6109 unsigned long field; 6110 u8 b; 6111 6112 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6113 return true; 6114 6115 /* Decode instruction info and find the field to access */ 6116 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6117 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6118 6119 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6120 if (field >> 15) 6121 return true; 6122 6123 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6124 return true; 6125 6126 return 1 & (b >> (field & 7)); 6127 } 6128 6129 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6130 { 6131 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6132 6133 if (nested_cpu_has_mtf(vmcs12)) 6134 return true; 6135 6136 /* 6137 * An MTF VM-exit may be injected into the guest by setting the 6138 * interruption-type to 7 (other event) and the vector field to 0. Such 6139 * is the case regardless of the 'monitor trap flag' VM-execution 6140 * control. 
6141 */ 6142 return entry_intr_info == (INTR_INFO_VALID_MASK 6143 | INTR_TYPE_OTHER_EVENT); 6144 } 6145 6146 /* 6147 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6148 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6149 */ 6150 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6151 union vmx_exit_reason exit_reason) 6152 { 6153 u32 intr_info; 6154 6155 switch ((u16)exit_reason.basic) { 6156 case EXIT_REASON_EXCEPTION_NMI: 6157 intr_info = vmx_get_intr_info(vcpu); 6158 if (is_nmi(intr_info)) 6159 return true; 6160 else if (is_page_fault(intr_info)) 6161 return vcpu->arch.apf.host_apf_flags || 6162 vmx_need_pf_intercept(vcpu); 6163 else if (is_debug(intr_info) && 6164 vcpu->guest_debug & 6165 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6166 return true; 6167 else if (is_breakpoint(intr_info) && 6168 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6169 return true; 6170 else if (is_alignment_check(intr_info) && 6171 !vmx_guest_inject_ac(vcpu)) 6172 return true; 6173 return false; 6174 case EXIT_REASON_EXTERNAL_INTERRUPT: 6175 return true; 6176 case EXIT_REASON_MCE_DURING_VMENTRY: 6177 return true; 6178 case EXIT_REASON_EPT_VIOLATION: 6179 /* 6180 * L0 always deals with the EPT violation. If nested EPT is 6181 * used, and the nested mmu code discovers that the address is 6182 * missing in the guest EPT table (EPT12), the EPT violation 6183 * will be injected with nested_ept_inject_page_fault() 6184 */ 6185 return true; 6186 case EXIT_REASON_EPT_MISCONFIG: 6187 /* 6188 * L2 never uses directly L1's EPT, but rather L0's own EPT 6189 * table (shadow on EPT) or a merged EPT table that L0 built 6190 * (EPT on EPT). So any problems with the structure of the 6191 * table is L0's fault. 6192 */ 6193 return true; 6194 case EXIT_REASON_PREEMPTION_TIMER: 6195 return true; 6196 case EXIT_REASON_PML_FULL: 6197 /* 6198 * PML is emulated for an L1 VMM and should never be enabled in 6199 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6200 */ 6201 return true; 6202 case EXIT_REASON_VMFUNC: 6203 /* VM functions are emulated through L2->L0 vmexits. */ 6204 return true; 6205 case EXIT_REASON_BUS_LOCK: 6206 /* 6207 * At present, bus lock VM exit is never exposed to L1. 6208 * Handle L2's bus locks in L0 directly. 6209 */ 6210 return true; 6211 case EXIT_REASON_VMCALL: 6212 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6213 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6214 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6215 kvm_hv_is_tlb_flush_hcall(vcpu); 6216 default: 6217 break; 6218 } 6219 return false; 6220 } 6221 6222 /* 6223 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6224 * is_guest_mode (L2). 
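* Note, this is consulted only after nested_vmx_l0_wants_exit() has declined the exit, see nested_vmx_reflect_vmexit().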
6225 */ 6226 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6227 union vmx_exit_reason exit_reason) 6228 { 6229 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6230 u32 intr_info; 6231 6232 switch ((u16)exit_reason.basic) { 6233 case EXIT_REASON_EXCEPTION_NMI: 6234 intr_info = vmx_get_intr_info(vcpu); 6235 if (is_nmi(intr_info)) 6236 return true; 6237 else if (is_page_fault(intr_info)) 6238 return true; 6239 return vmcs12->exception_bitmap & 6240 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6241 case EXIT_REASON_EXTERNAL_INTERRUPT: 6242 return nested_exit_on_intr(vcpu); 6243 case EXIT_REASON_TRIPLE_FAULT: 6244 return true; 6245 case EXIT_REASON_INTERRUPT_WINDOW: 6246 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6247 case EXIT_REASON_NMI_WINDOW: 6248 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6249 case EXIT_REASON_TASK_SWITCH: 6250 return true; 6251 case EXIT_REASON_CPUID: 6252 return true; 6253 case EXIT_REASON_HLT: 6254 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6255 case EXIT_REASON_INVD: 6256 return true; 6257 case EXIT_REASON_INVLPG: 6258 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6259 case EXIT_REASON_RDPMC: 6260 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6261 case EXIT_REASON_RDRAND: 6262 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6263 case EXIT_REASON_RDSEED: 6264 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6265 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6266 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6267 case EXIT_REASON_VMREAD: 6268 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6269 vmcs12->vmread_bitmap); 6270 case EXIT_REASON_VMWRITE: 6271 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6272 vmcs12->vmwrite_bitmap); 6273 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6274 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6275 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6276 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6277 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6278 /* 6279 * VMX instructions trap unconditionally. This allows L1 to 6280 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
6281 */ 6282 return true; 6283 case EXIT_REASON_CR_ACCESS: 6284 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6285 case EXIT_REASON_DR_ACCESS: 6286 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6287 case EXIT_REASON_IO_INSTRUCTION: 6288 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6289 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6290 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6291 case EXIT_REASON_MSR_READ: 6292 case EXIT_REASON_MSR_WRITE: 6293 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6294 case EXIT_REASON_INVALID_STATE: 6295 return true; 6296 case EXIT_REASON_MWAIT_INSTRUCTION: 6297 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6298 case EXIT_REASON_MONITOR_TRAP_FLAG: 6299 return nested_vmx_exit_handled_mtf(vmcs12); 6300 case EXIT_REASON_MONITOR_INSTRUCTION: 6301 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6302 case EXIT_REASON_PAUSE_INSTRUCTION: 6303 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6304 nested_cpu_has2(vmcs12, 6305 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6306 case EXIT_REASON_MCE_DURING_VMENTRY: 6307 return true; 6308 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6309 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6310 case EXIT_REASON_APIC_ACCESS: 6311 case EXIT_REASON_APIC_WRITE: 6312 case EXIT_REASON_EOI_INDUCED: 6313 /* 6314 * The controls for "virtualize APIC accesses," "APIC- 6315 * register virtualization," and "virtual-interrupt 6316 * delivery" only come from vmcs12. 6317 */ 6318 return true; 6319 case EXIT_REASON_INVPCID: 6320 return 6321 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6322 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6323 case EXIT_REASON_WBINVD: 6324 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6325 case EXIT_REASON_XSETBV: 6326 return true; 6327 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6328 /* 6329 * This should never happen, since it is not possible to 6330 * set XSS to a non-zero value---neither in L1 nor in L2. 6331 * If it were, XSS would have to be checked against 6332 * the XSS exit bitmap in vmcs12. 6333 */ 6334 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); 6335 case EXIT_REASON_UMWAIT: 6336 case EXIT_REASON_TPAUSE: 6337 return nested_cpu_has2(vmcs12, 6338 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6339 case EXIT_REASON_ENCLS: 6340 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6341 case EXIT_REASON_NOTIFY: 6342 /* Notify VM exit is not exposed to L1 */ 6343 return false; 6344 default: 6345 return true; 6346 } 6347 } 6348 6349 /* 6350 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6351 * reflected into L1. 6352 */ 6353 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6354 { 6355 struct vcpu_vmx *vmx = to_vmx(vcpu); 6356 union vmx_exit_reason exit_reason = vmx->exit_reason; 6357 unsigned long exit_qual; 6358 u32 exit_intr_info; 6359 6360 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6361 6362 /* 6363 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6364 * has already loaded L2's state. 6365 */ 6366 if (unlikely(vmx->fail)) { 6367 trace_kvm_nested_vmenter_failed( 6368 "hardware VM-instruction error: ", 6369 vmcs_read32(VM_INSTRUCTION_ERROR)); 6370 exit_intr_info = 0; 6371 exit_qual = 0; 6372 goto reflect_vmexit; 6373 } 6374 6375 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6376 6377 /* If L0 (KVM) wants the exit, it trumps L1's desires.
*/ 6378 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6379 return false; 6380 6381 /* If L1 doesn't want the exit, handle it in L0. */ 6382 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6383 return false; 6384 6385 /* 6386 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6387 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6388 * need to be synthesized by querying the in-kernel LAPIC, but external 6389 * interrupts are never reflected to L1 so it's a non-issue. 6390 */ 6391 exit_intr_info = vmx_get_intr_info(vcpu); 6392 if (is_exception_with_error_code(exit_intr_info)) { 6393 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6394 6395 vmcs12->vm_exit_intr_error_code = 6396 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6397 } 6398 exit_qual = vmx_get_exit_qual(vcpu); 6399 6400 reflect_vmexit: 6401 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6402 return true; 6403 } 6404 6405 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6406 struct kvm_nested_state __user *user_kvm_nested_state, 6407 u32 user_data_size) 6408 { 6409 struct vcpu_vmx *vmx; 6410 struct vmcs12 *vmcs12; 6411 struct kvm_nested_state kvm_state = { 6412 .flags = 0, 6413 .format = KVM_STATE_NESTED_FORMAT_VMX, 6414 .size = sizeof(kvm_state), 6415 .hdr.vmx.flags = 0, 6416 .hdr.vmx.vmxon_pa = INVALID_GPA, 6417 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6418 .hdr.vmx.preemption_timer_deadline = 0, 6419 }; 6420 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6421 &user_kvm_nested_state->data.vmx[0]; 6422 6423 if (!vcpu) 6424 return kvm_state.size + sizeof(*user_vmx_nested_state); 6425 6426 vmx = to_vmx(vcpu); 6427 vmcs12 = get_vmcs12(vcpu); 6428 6429 if (guest_can_use(vcpu, X86_FEATURE_VMX) && 6430 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6431 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6432 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6433 6434 if (vmx_has_valid_vmcs12(vcpu)) { 6435 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6436 6437 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6438 if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) 6439 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6440 6441 if (is_guest_mode(vcpu) && 6442 nested_cpu_has_shadow_vmcs(vmcs12) && 6443 vmcs12->vmcs_link_pointer != INVALID_GPA) 6444 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6445 } 6446 6447 if (vmx->nested.smm.vmxon) 6448 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6449 6450 if (vmx->nested.smm.guest_mode) 6451 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6452 6453 if (is_guest_mode(vcpu)) { 6454 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6455 6456 if (vmx->nested.nested_run_pending) 6457 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6458 6459 if (vmx->nested.mtf_pending) 6460 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6461 6462 if (nested_cpu_has_preemption_timer(vmcs12) && 6463 vmx->nested.has_preemption_timer_deadline) { 6464 kvm_state.hdr.vmx.flags |= 6465 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6466 kvm_state.hdr.vmx.preemption_timer_deadline = 6467 vmx->nested.preemption_timer_deadline; 6468 } 6469 } 6470 } 6471 6472 if (user_data_size < kvm_state.size) 6473 goto out; 6474 6475 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6476 return -EFAULT; 6477 6478 if (!vmx_has_valid_vmcs12(vcpu)) 6479 goto out; 6480 6481 /* 6482 * When running L2, the authoritative vmcs12 state is in the 6483 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6484 * in the shadow or enlightened vmcs linked to vmcs01, unless 6485 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6486 * vmcs12 state is in the vmcs12 already. 6487 */ 6488 if (is_guest_mode(vcpu)) { 6489 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6490 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6491 } else { 6492 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6493 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6494 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 6495 /* 6496 * The L1 hypervisor is not obliged to keep the eVMCS 6497 * clean-fields data up-to-date while 6498 * not in guest mode; 'hv_clean_fields' is only 6499 * guaranteed to be valid on VM-Entry, so ignore 6500 * it here and do a full copy. 6501 */ 6502 copy_enlightened_to_vmcs12(vmx, 0); 6503 else if (enable_shadow_vmcs) 6504 copy_shadow_to_vmcs12(vmx); 6505 } 6506 } 6507 6508 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6509 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6510 6511 /* 6512 * Copy over the full allocated size of vmcs12 rather than just the size 6513 * of the struct. 6514 */ 6515 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6516 return -EFAULT; 6517 6518 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6519 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6520 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6521 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6522 return -EFAULT; 6523 } 6524 out: 6525 return kvm_state.size; 6526 } 6527 6528 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6529 { 6530 if (is_guest_mode(vcpu)) { 6531 to_vmx(vcpu)->nested.nested_run_pending = 0; 6532 nested_vmx_vmexit(vcpu, -1, 0, 0); 6533 } 6534 free_nested(vcpu); 6535 } 6536 6537 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6538 struct kvm_nested_state __user *user_kvm_nested_state, 6539 struct kvm_nested_state *kvm_state) 6540 { 6541 struct vcpu_vmx *vmx = to_vmx(vcpu); 6542 struct vmcs12 *vmcs12; 6543 enum vm_entry_failure_code ignored; 6544 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6545 &user_kvm_nested_state->data.vmx[0]; 6546 int ret; 6547 6548 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6549 return -EINVAL; 6550 6551 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6552 if (kvm_state->hdr.vmx.smm.flags) 6553 return -EINVAL; 6554 6555 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6556 return -EINVAL; 6557 6558 /* 6559 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6560 * enable the eVMCS capability on the vCPU. However, the code 6561 * has since been changed such that the flag signals that vmcs12 6562 * should be copied into the eVMCS in guest memory. 6563 * 6564 * To preserve backwards compatibility, allow userspace 6565 * to set this flag even when there is no VMXON region.
6566 */ 6567 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6568 return -EINVAL; 6569 } else { 6570 if (!guest_can_use(vcpu, X86_FEATURE_VMX)) 6571 return -EINVAL; 6572 6573 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6574 return -EINVAL; 6575 } 6576 6577 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6578 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6579 return -EINVAL; 6580 6581 if (kvm_state->hdr.vmx.smm.flags & 6582 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6583 return -EINVAL; 6584 6585 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6586 return -EINVAL; 6587 6588 /* 6589 * SMM temporarily disables VMX, so we cannot be in guest mode, 6590 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6591 * must be zero. 6592 */ 6593 if (is_smm(vcpu) ? 6594 (kvm_state->flags & 6595 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6596 : kvm_state->hdr.vmx.smm.flags) 6597 return -EINVAL; 6598 6599 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6600 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6601 return -EINVAL; 6602 6603 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6604 (!guest_can_use(vcpu, X86_FEATURE_VMX) || 6605 !vmx->nested.enlightened_vmcs_enabled)) 6606 return -EINVAL; 6607 6608 vmx_leave_nested(vcpu); 6609 6610 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6611 return 0; 6612 6613 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6614 ret = enter_vmx_operation(vcpu); 6615 if (ret) 6616 return ret; 6617 6618 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6619 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6620 /* See vmx_has_valid_vmcs12. */ 6621 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6622 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6623 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6624 return -EINVAL; 6625 else 6626 return 0; 6627 } 6628 6629 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6630 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6631 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6632 return -EINVAL; 6633 6634 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6635 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6636 /* 6637 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6638 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6639 * restored yet. EVMCS will be mapped from 6640 * nested_get_vmcs12_pages(). 
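* The mapping is instead deferred by marking the eVMCS pointer as EVMPTR_MAP_PENDING and raising KVM_REQ_GET_NESTED_STATE_PAGES below.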
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	if (vmx->nested.mtf_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
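
/*
 * For example, GUEST_ES_SELECTOR (encoding 0x0800) lands at index 0x0002
 * in vmcs12_field_offsets[] (0x0800 rotated left by 6 in 16 bits), and
 * VMCS12_IDX_TO_ENC(0x0002) == 0x0800 recovers the original encoding.
 */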
static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so called "index" of the VMCS field encoding,
	 * not the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1.  Simply find the field with the highest index.
	 */
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated. */
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}
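
/*
 * Each nested_vmx_setup_*_ctls() helper below follows the same pattern:
 * start from the host's vmcs_config value, mask it down to the controls
 * KVM knows how to virtualize for L1, then OR in the always-on bits and
 * any controls KVM emulates even when the hardware lacks them (e.g. the
 * VMX preemption timer and MSR bitmaps).
 */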
static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->pinbased_ctls_low =
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
}

static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
					struct nested_vmx_msrs *msrs)
{
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
		 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
}

static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware.  For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
}

static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
					    struct vmcs_config *vmcs_conf,
					    struct nested_vmx_msrs *msrs)
{
	msrs->secondary_ctls_low = 0;

	msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_ENABLE_VMFUNC |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_ENABLE_XSAVES |
		SECONDARY_EXEC_TSC_SCALING |
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}

		/*
		 * Advertise EPTP switching irrespective of hardware support;
		 * KVM emulates it in software so long as VMFUNC is supported.
		 */
		if (cpu_has_vmx_vmfunc())
			msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version of INVVPID
	 * without checking for support, so declare that it is supported even
	 * though it is treated as global context.  The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
}

static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;
}

static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{
	/*
	 * This MSR reports some information about VMX support.  We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic =
		VMCS12_REVISION |
		VMX_BASIC_TRUE_CTLS |
		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;
}

static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{
	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;
}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control MSRs has a low and a high 32-bit half: a bit in the
 * low half is on if the corresponding bit in the (32-bit) control field *must*
 * be on, and a bit in the high half is on if the corresponding bit in the
 * control field may be on.  See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
	struct nested_vmx_msrs *msrs = &vmcs_conf->nested;

	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits.  The
	 * reason is that if one of these bits is needed by KVM itself, it
	 * will be set in vmcs01, and prepare_vmcs02(), which bitwise-or's the
	 * control fields of vmcs01 and vmcs12, will keep it set in vmcs02
	 * regardless - and nested_vmx_l1_wants_exit() will not pass the
	 * related exits on to L1.
	 * These rules have exceptions below.
	 */
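	/*
	 * The two halves are combined into the guest-visible 64-bit MSR
	 * value (low half in bits 31:0, high half in bits 63:32) when L1
	 * reads the corresponding IA32_VMX_* MSR, and the same values back
	 * vmx_control_verify() when vmcs12 controls are checked at nested
	 * VM-entry.
	 */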
	nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_exit_ctls(vmcs_conf, msrs);

	nested_vmx_setup_entry_ctls(vmcs_conf, msrs);

	nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);

	nested_vmx_setup_misc_data(vmcs_conf, msrs);

	nested_vmx_setup_basic(msrs);

	nested_vmx_setup_cr_fixed(msrs);

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON]		= handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept;
	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc;

	return 0;
}

struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
};