// SPDX-License-Identifier: GPL-2.0

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"
#include "smm.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16 encoding;
	u16 offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
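		/*
		 * 64-bit VMCS fields use an even encoding for the full field
		 * and encoding + 1 for the high 32 bits.  A 64-bit kernel
		 * accesses the full field directly, so the "high" entry is
		 * dropped; a 32-bit kernel keeps it and accesses the high half
		 * of the vmcs12 field at offset + 4.
		 */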
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: not to reset guest simply here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

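/*
 * The VMX capability MSRs for controls pack the allowed 0-settings into bits
 * 31:0 (a control bit must be 1 if the corresponding bit is set there) and
 * the allowed 1-settings into bits 63:32 (a control bit may be 1 only if the
 * corresponding bit is set there).  vmx_control_msr() builds that 64-bit
 * layout from a low/high pair; vmx_control_verify() checks a control value
 * against it.
 */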
static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
		kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
		vmx->nested.hv_evmcs = NULL;
	}

	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;

	if (hv_vcpu) {
		hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
		hv_vcpu->nested.vm_id = 0;
		hv_vcpu->nested.vp_id = 0;
	}
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;

	/*
	 * All lazily updated registers will be reloaded from VMCS12 on both
	 * vmentry and vmexit.
	 */
	vcpu->arch.regs_dirty = 0;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = INVALID_GPA;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
325 */ 326 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 327 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 328 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 329 vmx->nested.pi_desc = NULL; 330 331 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 332 333 nested_release_evmcs(vcpu); 334 335 free_loaded_vmcs(&vmx->nested.vmcs02); 336 } 337 338 /* 339 * Ensure that the current vmcs of the logical processor is the 340 * vmcs01 of the vcpu before calling free_nested(). 341 */ 342 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 343 { 344 vcpu_load(vcpu); 345 vmx_leave_nested(vcpu); 346 vcpu_put(vcpu); 347 } 348 349 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 350 351 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 352 { 353 return VALID_PAGE(root_hpa) && 354 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 355 } 356 357 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 358 gpa_t addr) 359 { 360 uint i; 361 struct kvm_mmu_root_info *cached_root; 362 363 WARN_ON_ONCE(!mmu_is_nested(vcpu)); 364 365 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 366 cached_root = &vcpu->arch.mmu->prev_roots[i]; 367 368 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 369 eptp)) 370 vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); 371 } 372 } 373 374 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 375 struct x86_exception *fault) 376 { 377 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 378 struct vcpu_vmx *vmx = to_vmx(vcpu); 379 u32 vm_exit_reason; 380 unsigned long exit_qualification = vcpu->arch.exit_qualification; 381 382 if (vmx->nested.pml_full) { 383 vm_exit_reason = EXIT_REASON_PML_FULL; 384 vmx->nested.pml_full = false; 385 exit_qualification &= INTR_INFO_UNBLOCK_NMI; 386 } else { 387 if (fault->error_code & PFERR_RSVD_MASK) 388 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 389 else 390 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 391 392 /* 393 * Although the caller (kvm_inject_emulated_page_fault) would 394 * have already synced the faulting address in the shadow EPT 395 * tables for the current EPTP12, we also need to sync it for 396 * any other cached EPTP02s based on the same EP4TA, since the 397 * TLB associates mappings to the EP4TA rather than the full EPTP. 
398 */ 399 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, 400 fault->address); 401 } 402 403 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 404 vmcs12->guest_physical_address = fault->address; 405 } 406 407 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) 408 { 409 struct vcpu_vmx *vmx = to_vmx(vcpu); 410 bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; 411 int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); 412 413 kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, 414 nested_ept_ad_enabled(vcpu), 415 nested_ept_get_eptp(vcpu)); 416 } 417 418 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 419 { 420 WARN_ON(mmu_is_nested(vcpu)); 421 422 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 423 nested_ept_new_eptp(vcpu); 424 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; 425 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 426 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 427 428 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 429 } 430 431 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 432 { 433 vcpu->arch.mmu = &vcpu->arch.root_mmu; 434 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 435 } 436 437 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 438 u16 error_code) 439 { 440 bool inequality, bit; 441 442 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 443 inequality = 444 (error_code & vmcs12->page_fault_error_code_mask) != 445 vmcs12->page_fault_error_code_match; 446 return inequality ^ bit; 447 } 448 449 static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, 450 u32 error_code) 451 { 452 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 453 454 /* 455 * Drop bits 31:16 of the error code when performing the #PF mask+match 456 * check. All VMCS fields involved are 32 bits, but Intel CPUs never 457 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected 458 * error code. Including the to-be-dropped bits in the check might 459 * result in an "impossible" or missed exit from L1's perspective. 460 */ 461 if (vector == PF_VECTOR) 462 return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); 463 464 return (vmcs12->exception_bitmap & (1u << vector)); 465 } 466 467 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 468 struct vmcs12 *vmcs12) 469 { 470 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 471 return 0; 472 473 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 474 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 475 return -EINVAL; 476 477 return 0; 478 } 479 480 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 481 struct vmcs12 *vmcs12) 482 { 483 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 484 return 0; 485 486 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 487 return -EINVAL; 488 489 return 0; 490 } 491 492 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 493 struct vmcs12 *vmcs12) 494 { 495 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 496 return 0; 497 498 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 499 return -EINVAL; 500 501 return 0; 502 } 503 504 /* 505 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 506 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, 507 * only the "disable intercept" case needs to be handled. 
508 */ 509 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, 510 unsigned long *msr_bitmap_l0, 511 u32 msr, int type) 512 { 513 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) 514 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); 515 516 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) 517 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); 518 } 519 520 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 521 { 522 int msr; 523 524 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 525 unsigned word = msr / BITS_PER_LONG; 526 527 msr_bitmap[word] = ~0; 528 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 529 } 530 } 531 532 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ 533 static inline \ 534 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ 535 unsigned long *msr_bitmap_l1, \ 536 unsigned long *msr_bitmap_l0, u32 msr) \ 537 { \ 538 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ 539 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ 540 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 541 else \ 542 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 543 } 544 BUILD_NVMX_MSR_INTERCEPT_HELPER(read) 545 BUILD_NVMX_MSR_INTERCEPT_HELPER(write) 546 547 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, 548 unsigned long *msr_bitmap_l1, 549 unsigned long *msr_bitmap_l0, 550 u32 msr, int types) 551 { 552 if (types & MSR_TYPE_R) 553 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, 554 msr_bitmap_l0, msr); 555 if (types & MSR_TYPE_W) 556 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, 557 msr_bitmap_l0, msr); 558 } 559 560 /* 561 * Merge L0's and L1's MSR bitmap, return false to indicate that 562 * we do not use the hardware. 563 */ 564 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 565 struct vmcs12 *vmcs12) 566 { 567 struct vcpu_vmx *vmx = to_vmx(vcpu); 568 int msr; 569 unsigned long *msr_bitmap_l1; 570 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; 571 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 572 struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; 573 574 /* Nothing to do if the MSR bitmap is not in use. */ 575 if (!cpu_has_vmx_msr_bitmap() || 576 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 577 return false; 578 579 /* 580 * MSR bitmap update can be skipped when: 581 * - MSR bitmap for L1 hasn't changed. 582 * - Nested hypervisor (L1) is attempting to launch the same L2 as 583 * before. 584 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature 585 * and tells KVM (L0) there were no changes in MSR bitmap for L2. 586 */ 587 if (!vmx->nested.force_msr_bitmap_recalc && evmcs && 588 evmcs->hv_enlightenments_control.msr_bitmap && 589 evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) 590 return true; 591 592 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) 593 return false; 594 595 msr_bitmap_l1 = (unsigned long *)map->hva; 596 597 /* 598 * To keep the control flow simple, pay eight 8-byte writes (sixteen 599 * 4-byte writes on 32-bit systems) up front to enable intercepts for 600 * the x2APIC MSR range and selectively toggle those relevant to L2. 
601 */ 602 enable_x2apic_msr_intercepts(msr_bitmap_l0); 603 604 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 605 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 606 /* 607 * L0 need not intercept reads for MSRs between 0x800 608 * and 0x8ff, it just lets the processor take the value 609 * from the virtual-APIC page; take those 256 bits 610 * directly from the L1 bitmap. 611 */ 612 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 613 unsigned word = msr / BITS_PER_LONG; 614 615 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 616 } 617 } 618 619 nested_vmx_disable_intercept_for_x2apic_msr( 620 msr_bitmap_l1, msr_bitmap_l0, 621 X2APIC_MSR(APIC_TASKPRI), 622 MSR_TYPE_R | MSR_TYPE_W); 623 624 if (nested_cpu_has_vid(vmcs12)) { 625 nested_vmx_disable_intercept_for_x2apic_msr( 626 msr_bitmap_l1, msr_bitmap_l0, 627 X2APIC_MSR(APIC_EOI), 628 MSR_TYPE_W); 629 nested_vmx_disable_intercept_for_x2apic_msr( 630 msr_bitmap_l1, msr_bitmap_l0, 631 X2APIC_MSR(APIC_SELF_IPI), 632 MSR_TYPE_W); 633 } 634 } 635 636 /* 637 * Always check vmcs01's bitmap to honor userspace MSR filters and any 638 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 639 */ 640 #ifdef CONFIG_X86_64 641 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 642 MSR_FS_BASE, MSR_TYPE_RW); 643 644 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 645 MSR_GS_BASE, MSR_TYPE_RW); 646 647 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 648 MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 649 #endif 650 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 651 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 652 653 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 654 MSR_IA32_PRED_CMD, MSR_TYPE_W); 655 656 kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); 657 658 vmx->nested.force_msr_bitmap_recalc = false; 659 660 return true; 661 } 662 663 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 664 struct vmcs12 *vmcs12) 665 { 666 struct vcpu_vmx *vmx = to_vmx(vcpu); 667 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 668 669 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 670 vmcs12->vmcs_link_pointer == INVALID_GPA) 671 return; 672 673 if (ghc->gpa != vmcs12->vmcs_link_pointer && 674 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 675 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 676 return; 677 678 kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 679 VMCS12_SIZE); 680 } 681 682 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 683 struct vmcs12 *vmcs12) 684 { 685 struct vcpu_vmx *vmx = to_vmx(vcpu); 686 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 687 688 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 689 vmcs12->vmcs_link_pointer == INVALID_GPA) 690 return; 691 692 if (ghc->gpa != vmcs12->vmcs_link_pointer && 693 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 694 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 695 return; 696 697 kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 698 VMCS12_SIZE); 699 } 700 701 /* 702 * In nested virtualization, check if L1 has set 703 * VM_EXIT_ACK_INTR_ON_EXIT 704 */ 705 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 706 { 707 return get_vmcs12(vcpu)->vm_exit_controls & 708 VM_EXIT_ACK_INTR_ON_EXIT; 709 } 710 711 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 712 struct vmcs12 *vmcs12) 713 { 714 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 715 CC(!page_address_valid(vcpu, 
static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    (CC(!nested_cpu_has_vid(vmcs12)) ||
	     CC(!nested_exit_intr_ack_set(vcpu)) ||
	     CC((vmcs12->posted_intr_nv & 0xff00)) ||
	     CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

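/*
 * Bits 27:25 of IA32_VMX_MISC report the recommended maximum number of MSRs
 * in a VM-entry/VM-exit MSR load/store list as 512 * (N + 1) entries; compute
 * that limit for the vCPU's virtual VMX capabilities.
 */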
static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
 * as possible, process all valid entries before failing rather than precheck
 * for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC) {
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

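/*
 * Read and validate only the 'index' and 'reserved' fields of a VM-exit
 * MSR-store entry (the first 2 * sizeof(u32) bytes); the 64-bit 'value'
 * field is filled in by KVM rather than consumed, so it isn't read here.
 */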
946 */ 947 if (msr_index == MSR_IA32_TSC) { 948 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, 949 MSR_IA32_TSC); 950 951 if (i >= 0) { 952 u64 val = vmx->msr_autostore.guest.val[i].value; 953 954 *data = kvm_read_l1_tsc(vcpu, val); 955 return true; 956 } 957 } 958 959 if (kvm_get_msr(vcpu, msr_index, data)) { 960 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 961 msr_index); 962 return false; 963 } 964 return true; 965 } 966 967 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 968 struct vmx_msr_entry *e) 969 { 970 if (kvm_vcpu_read_guest(vcpu, 971 gpa + i * sizeof(*e), 972 e, 2 * sizeof(u32))) { 973 pr_debug_ratelimited( 974 "%s cannot read MSR entry (%u, 0x%08llx)\n", 975 __func__, i, gpa + i * sizeof(*e)); 976 return false; 977 } 978 if (nested_vmx_store_msr_check(vcpu, e)) { 979 pr_debug_ratelimited( 980 "%s check failed (%u, 0x%x, 0x%x)\n", 981 __func__, i, e->index, e->reserved); 982 return false; 983 } 984 return true; 985 } 986 987 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 988 { 989 u64 data; 990 u32 i; 991 struct vmx_msr_entry e; 992 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 993 994 for (i = 0; i < count; i++) { 995 if (unlikely(i >= max_msr_list_size)) 996 return -EINVAL; 997 998 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 999 return -EINVAL; 1000 1001 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 1002 return -EINVAL; 1003 1004 if (kvm_vcpu_write_guest(vcpu, 1005 gpa + i * sizeof(e) + 1006 offsetof(struct vmx_msr_entry, value), 1007 &data, sizeof(data))) { 1008 pr_debug_ratelimited( 1009 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1010 __func__, i, e.index, data); 1011 return -EINVAL; 1012 } 1013 } 1014 return 0; 1015 } 1016 1017 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1018 { 1019 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1020 u32 count = vmcs12->vm_exit_msr_store_count; 1021 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1022 struct vmx_msr_entry e; 1023 u32 i; 1024 1025 for (i = 0; i < count; i++) { 1026 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1027 return false; 1028 1029 if (e.index == msr_index) 1030 return true; 1031 } 1032 return false; 1033 } 1034 1035 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, 1036 u32 msr_index) 1037 { 1038 struct vcpu_vmx *vmx = to_vmx(vcpu); 1039 struct vmx_msrs *autostore = &vmx->msr_autostore.guest; 1040 bool in_vmcs12_store_list; 1041 int msr_autostore_slot; 1042 bool in_autostore_list; 1043 int last; 1044 1045 msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); 1046 in_autostore_list = msr_autostore_slot >= 0; 1047 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); 1048 1049 if (in_vmcs12_store_list && !in_autostore_list) { 1050 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { 1051 /* 1052 * Emulated VMEntry does not fail here. Instead a less 1053 * accurate value will be returned by 1054 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() 1055 * instead of reading the value from the vmcs02 VMExit 1056 * MSR-store area. 1057 */ 1058 pr_warn_ratelimited( 1059 "Not enough msr entries in msr_autostore. 
static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_slot;
	bool in_autostore_list;
	int last;

	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
	in_autostore_list = msr_autostore_slot >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here. Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
			 * instead of reading the value from the vmcs02 VMExit
			 * MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore. Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

/*
 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_PDPTE;
		return -EINVAL;
	}

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);

	return 0;
}

/*
 * Returns if KVM is able to config CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

1157 */ 1158 if (!nested_cpu_has_vpid(vmcs12)) { 1159 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1160 return; 1161 } 1162 1163 /* L2 should never have a VPID if VPID is disabled. */ 1164 WARN_ON(!enable_vpid); 1165 1166 /* 1167 * VPID is enabled and in use by vmcs12. If vpid12 is changing, then 1168 * emulate a guest TLB flush as KVM does not track vpid12 history nor 1169 * is the VPID incorporated into the MMU context. I.e. KVM must assume 1170 * that the new vpid12 has never been used and thus represents a new 1171 * guest ASID that cannot have entries in the TLB. 1172 */ 1173 if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 1174 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 1175 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1176 return; 1177 } 1178 1179 /* 1180 * If VPID is enabled, used by vmc12, and vpid12 is not changing but 1181 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and 1182 * KVM was unable to allocate a VPID for L2, flush the current context 1183 * as the effective ASID is common to both L1 and L2. 1184 */ 1185 if (!nested_has_guest_tlb_tag(vcpu)) 1186 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 1187 } 1188 1189 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) 1190 { 1191 superset &= mask; 1192 subset &= mask; 1193 1194 return (superset | subset) == superset; 1195 } 1196 1197 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1198 { 1199 const u64 feature_and_reserved = 1200 /* feature (except bit 48; see below) */ 1201 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | 1202 /* reserved */ 1203 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); 1204 u64 vmx_basic = vmcs_config.nested.basic; 1205 1206 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) 1207 return -EINVAL; 1208 1209 /* 1210 * KVM does not emulate a version of VMX that constrains physical 1211 * addresses of VMX structures (e.g. VMCS) to 32-bits. 1212 */ 1213 if (data & BIT_ULL(48)) 1214 return -EINVAL; 1215 1216 if (vmx_basic_vmcs_revision_id(vmx_basic) != 1217 vmx_basic_vmcs_revision_id(data)) 1218 return -EINVAL; 1219 1220 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 1221 return -EINVAL; 1222 1223 vmx->nested.msrs.basic = data; 1224 return 0; 1225 } 1226 1227 static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, 1228 u32 **low, u32 **high) 1229 { 1230 switch (msr_index) { 1231 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1232 *low = &msrs->pinbased_ctls_low; 1233 *high = &msrs->pinbased_ctls_high; 1234 break; 1235 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1236 *low = &msrs->procbased_ctls_low; 1237 *high = &msrs->procbased_ctls_high; 1238 break; 1239 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1240 *low = &msrs->exit_ctls_low; 1241 *high = &msrs->exit_ctls_high; 1242 break; 1243 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1244 *low = &msrs->entry_ctls_low; 1245 *high = &msrs->entry_ctls_high; 1246 break; 1247 case MSR_IA32_VMX_PROCBASED_CTLS2: 1248 *low = &msrs->secondary_ctls_low; 1249 *high = &msrs->secondary_ctls_high; 1250 break; 1251 default: 1252 BUG(); 1253 } 1254 } 1255 1256 static int 1257 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1258 { 1259 u32 *lowp, *highp; 1260 u64 supported; 1261 1262 vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); 1263 1264 supported = vmx_control_msr(*lowp, *highp); 1265 1266 /* Check must-be-1 bits are still 1. 
static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u32 *lowp, *highp;
	u64 supported;

	vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
				       vmcs_config.nested.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
					       vmcs_config.nested.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
{
	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		return &msrs->cr0_fixed0;
	case MSR_IA32_VMX_CR4_FIXED0:
		return &msrs->cr4_fixed0;
	default:
		BUG();
	}
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);

	/*
	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
1382 * 1383 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1384 * should restore the "true" MSRs with the must-be-1 bits 1385 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND 1386 * DEFAULT SETTINGS". 1387 */ 1388 return -EINVAL; 1389 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1390 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1391 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1392 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1393 case MSR_IA32_VMX_PROCBASED_CTLS2: 1394 return vmx_restore_control_msr(vmx, msr_index, data); 1395 case MSR_IA32_VMX_MISC: 1396 return vmx_restore_vmx_misc(vmx, data); 1397 case MSR_IA32_VMX_CR0_FIXED0: 1398 case MSR_IA32_VMX_CR4_FIXED0: 1399 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1400 case MSR_IA32_VMX_CR0_FIXED1: 1401 case MSR_IA32_VMX_CR4_FIXED1: 1402 /* 1403 * These MSRs are generated based on the vCPU's CPUID, so we 1404 * do not support restoring them directly. 1405 */ 1406 return -EINVAL; 1407 case MSR_IA32_VMX_EPT_VPID_CAP: 1408 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 1409 case MSR_IA32_VMX_VMCS_ENUM: 1410 vmx->nested.msrs.vmcs_enum = data; 1411 return 0; 1412 case MSR_IA32_VMX_VMFUNC: 1413 if (data & ~vmcs_config.nested.vmfunc_controls) 1414 return -EINVAL; 1415 vmx->nested.msrs.vmfunc_controls = data; 1416 return 0; 1417 default: 1418 /* 1419 * The rest of the VMX capability MSRs do not support restore. 1420 */ 1421 return -EINVAL; 1422 } 1423 } 1424 1425 /* Returns 0 on success, non-0 otherwise. */ 1426 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1427 { 1428 switch (msr_index) { 1429 case MSR_IA32_VMX_BASIC: 1430 *pdata = msrs->basic; 1431 break; 1432 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1433 case MSR_IA32_VMX_PINBASED_CTLS: 1434 *pdata = vmx_control_msr( 1435 msrs->pinbased_ctls_low, 1436 msrs->pinbased_ctls_high); 1437 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1438 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1439 break; 1440 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1441 case MSR_IA32_VMX_PROCBASED_CTLS: 1442 *pdata = vmx_control_msr( 1443 msrs->procbased_ctls_low, 1444 msrs->procbased_ctls_high); 1445 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1446 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1447 break; 1448 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1449 case MSR_IA32_VMX_EXIT_CTLS: 1450 *pdata = vmx_control_msr( 1451 msrs->exit_ctls_low, 1452 msrs->exit_ctls_high); 1453 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1454 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1455 break; 1456 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1457 case MSR_IA32_VMX_ENTRY_CTLS: 1458 *pdata = vmx_control_msr( 1459 msrs->entry_ctls_low, 1460 msrs->entry_ctls_high); 1461 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1462 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1463 break; 1464 case MSR_IA32_VMX_MISC: 1465 *pdata = vmx_control_msr( 1466 msrs->misc_low, 1467 msrs->misc_high); 1468 break; 1469 case MSR_IA32_VMX_CR0_FIXED0: 1470 *pdata = msrs->cr0_fixed0; 1471 break; 1472 case MSR_IA32_VMX_CR0_FIXED1: 1473 *pdata = msrs->cr0_fixed1; 1474 break; 1475 case MSR_IA32_VMX_CR4_FIXED0: 1476 *pdata = msrs->cr4_fixed0; 1477 break; 1478 case MSR_IA32_VMX_CR4_FIXED1: 1479 *pdata = msrs->cr4_fixed1; 1480 break; 1481 case MSR_IA32_VMX_VMCS_ENUM: 1482 *pdata = msrs->vmcs_enum; 1483 break; 1484 case MSR_IA32_VMX_PROCBASED_CTLS2: 1485 *pdata = vmx_control_msr( 1486 msrs->secondary_ctls_low, 1487 msrs->secondary_ctls_high); 1488 break; 1489 case MSR_IA32_VMX_EPT_VPID_CAP: 1490 *pdata = msrs->ept_caps | 1491 ((u64)msrs->vpid_caps << 32); 1492 break; 1493 
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest. Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
		hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
		hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
		hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ssp = evmcs->guest_ssp;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
		vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
		 * vmcs12->host_ssp = evmcs->host_ssp;
		 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
		vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
		vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
evmcs->guest_sysenter_cs; 1768 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; 1769 /* 1770 * Not present in struct vmcs12: 1771 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; 1772 * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1773 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1774 */ 1775 } 1776 1777 /* 1778 * Not used? 1779 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1780 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1781 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1782 * vmcs12->page_fault_error_code_mask = 1783 * evmcs->page_fault_error_code_mask; 1784 * vmcs12->page_fault_error_code_match = 1785 * evmcs->page_fault_error_code_match; 1786 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1787 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1788 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1789 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1790 */ 1791 1792 /* 1793 * Read only fields: 1794 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1795 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1796 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1797 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1798 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1799 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1800 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1801 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1802 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1803 * vmcs12->exit_qualification = evmcs->exit_qualification; 1804 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1805 * 1806 * Not present in struct vmcs12: 1807 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1808 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1809 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1810 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1811 */ 1812 1813 return; 1814 } 1815 1816 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1817 { 1818 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1819 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1820 1821 /* 1822 * Should not be changed by KVM: 1823 * 1824 * evmcs->host_es_selector = vmcs12->host_es_selector; 1825 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1826 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1827 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1828 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1829 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1830 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1831 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1832 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1833 * evmcs->host_cr0 = vmcs12->host_cr0; 1834 * evmcs->host_cr3 = vmcs12->host_cr3; 1835 * evmcs->host_cr4 = vmcs12->host_cr4; 1836 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1837 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1838 * evmcs->host_rip = vmcs12->host_rip; 1839 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1840 * evmcs->host_fs_base = vmcs12->host_fs_base; 1841 * evmcs->host_gs_base = vmcs12->host_gs_base; 1842 * evmcs->host_tr_base = vmcs12->host_tr_base; 1843 * 
evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1844 * evmcs->host_idtr_base = vmcs12->host_idtr_base; 1845 * evmcs->host_rsp = vmcs12->host_rsp; 1846 * sync_vmcs02_to_vmcs12() doesn't read these: 1847 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1848 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1849 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1850 * evmcs->ept_pointer = vmcs12->ept_pointer; 1851 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1852 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1853 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1854 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1855 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1856 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1857 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1858 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1859 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1860 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1861 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1862 * evmcs->page_fault_error_code_mask = 1863 * vmcs12->page_fault_error_code_mask; 1864 * evmcs->page_fault_error_code_match = 1865 * vmcs12->page_fault_error_code_match; 1866 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1867 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1868 * evmcs->tsc_offset = vmcs12->tsc_offset; 1869 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1870 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1871 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1872 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1873 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1874 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1875 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1876 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1877 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; 1878 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; 1879 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; 1880 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; 1881 * 1882 * Not present in struct vmcs12: 1883 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1884 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1885 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1886 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1887 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; 1888 * evmcs->host_ssp = vmcs12->host_ssp; 1889 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; 1890 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; 1891 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; 1892 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; 1893 * evmcs->guest_ssp = vmcs12->guest_ssp; 1894 */ 1895 1896 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1897 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1898 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1899 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1900 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1901 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1902 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1903 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1904 1905 
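	/*
	 * Everything from here to the end of the function is state that KVM
	 * may have updated while running L2 (via sync_vmcs02_to_vmcs12() or
	 * nested VM-Exit processing), so it is always copied back for L1 to
	 * see.
	 */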
evmcs->guest_es_limit = vmcs12->guest_es_limit; 1906 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1907 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1908 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1909 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1910 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1911 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1912 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1913 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1914 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1915 1916 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1917 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1918 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1919 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1920 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1921 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1922 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1923 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1924 1925 evmcs->guest_es_base = vmcs12->guest_es_base; 1926 evmcs->guest_cs_base = vmcs12->guest_cs_base; 1927 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1928 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1929 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1930 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1931 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1932 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1933 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1934 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1935 1936 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1937 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1938 1939 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1940 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1941 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1942 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1943 1944 evmcs->guest_pending_dbg_exceptions = 1945 vmcs12->guest_pending_dbg_exceptions; 1946 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1947 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1948 1949 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1950 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1951 1952 evmcs->guest_cr0 = vmcs12->guest_cr0; 1953 evmcs->guest_cr3 = vmcs12->guest_cr3; 1954 evmcs->guest_cr4 = vmcs12->guest_cr4; 1955 evmcs->guest_dr7 = vmcs12->guest_dr7; 1956 1957 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1958 1959 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1960 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1961 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1962 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1963 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1964 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1965 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1966 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1967 1968 evmcs->exit_qualification = vmcs12->exit_qualification; 1969 1970 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1971 evmcs->guest_rsp = vmcs12->guest_rsp; 1972 evmcs->guest_rflags = vmcs12->guest_rflags; 1973 1974 evmcs->guest_interruptibility_info = 1975 vmcs12->guest_interruptibility_info; 1976 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1977 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1978 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1979 evmcs->vm_entry_exception_error_code = 
1980 vmcs12->vm_entry_exception_error_code; 1981 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 1982 1983 evmcs->guest_rip = vmcs12->guest_rip; 1984 1985 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 1986 1987 return; 1988 } 1989 1990 /* 1991 * This is an equivalent of the nested hypervisor executing the vmptrld 1992 * instruction. 1993 */ 1994 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( 1995 struct kvm_vcpu *vcpu, bool from_launch) 1996 { 1997 struct vcpu_vmx *vmx = to_vmx(vcpu); 1998 bool evmcs_gpa_changed = false; 1999 u64 evmcs_gpa; 2000 2001 if (likely(!guest_cpuid_has_evmcs(vcpu))) 2002 return EVMPTRLD_DISABLED; 2003 2004 evmcs_gpa = nested_get_evmptr(vcpu); 2005 if (!evmptr_is_valid(evmcs_gpa)) { 2006 nested_release_evmcs(vcpu); 2007 return EVMPTRLD_DISABLED; 2008 } 2009 2010 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { 2011 vmx->nested.current_vmptr = INVALID_GPA; 2012 2013 nested_release_evmcs(vcpu); 2014 2015 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), 2016 &vmx->nested.hv_evmcs_map)) 2017 return EVMPTRLD_ERROR; 2018 2019 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; 2020 2021 /* 2022 * Currently, KVM only supports eVMCS version 1 2023 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this 2024 * value to first u32 field of eVMCS which should specify eVMCS 2025 * VersionNumber. 2026 * 2027 * Guest should be aware of supported eVMCS versions by host by 2028 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is 2029 * expected to set this CPUID leaf according to the value 2030 * returned in vmcs_version from nested_enable_evmcs(). 2031 * 2032 * However, it turns out that Microsoft Hyper-V fails to comply 2033 * to their own invented interface: When Hyper-V use eVMCS, it 2034 * just sets first u32 field of eVMCS to revision_id specified 2035 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number 2036 * which is one of the supported versions specified in 2037 * CPUID.0x4000000A.EAX[0:15]. 2038 * 2039 * To overcome Hyper-V bug, we accept here either a supported 2040 * eVMCS version or VMCS12 revision_id as valid values for first 2041 * u32 field of eVMCS. 2042 */ 2043 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && 2044 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { 2045 nested_release_evmcs(vcpu); 2046 return EVMPTRLD_VMFAIL; 2047 } 2048 2049 vmx->nested.hv_evmcs_vmptr = evmcs_gpa; 2050 2051 evmcs_gpa_changed = true; 2052 /* 2053 * Unlike normal vmcs12, enlightened vmcs12 is not fully 2054 * reloaded from guest's memory (read only fields, fields not 2055 * present in struct hv_enlightened_vmcs, ...). Make sure there 2056 * are no leftovers. 2057 */ 2058 if (from_launch) { 2059 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2060 memset(vmcs12, 0, sizeof(*vmcs12)); 2061 vmcs12->hdr.revision_id = VMCS12_REVISION; 2062 } 2063 2064 } 2065 2066 /* 2067 * Clean fields data can't be used on VMLAUNCH and when we switch 2068 * between different L2 guests as KVM keeps a single VMCS12 per L1. 
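 * Stale clean bits would let KVM skip copying eVMCS fields that actually
 * differ from the cached vmcs12, so all of them are cleared below to force
 * a full copy on the next sync.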
2069 */ 2070 if (from_launch || evmcs_gpa_changed) { 2071 vmx->nested.hv_evmcs->hv_clean_fields &= 2072 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2073 2074 vmx->nested.force_msr_bitmap_recalc = true; 2075 } 2076 2077 return EVMPTRLD_SUCCEEDED; 2078 } 2079 2080 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2081 { 2082 struct vcpu_vmx *vmx = to_vmx(vcpu); 2083 2084 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2085 copy_vmcs12_to_enlightened(vmx); 2086 else 2087 copy_vmcs12_to_shadow(vmx); 2088 2089 vmx->nested.need_vmcs12_to_shadow_sync = false; 2090 } 2091 2092 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2093 { 2094 struct vcpu_vmx *vmx = 2095 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2096 2097 vmx->nested.preemption_timer_expired = true; 2098 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2099 kvm_vcpu_kick(&vmx->vcpu); 2100 2101 return HRTIMER_NORESTART; 2102 } 2103 2104 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2105 { 2106 struct vcpu_vmx *vmx = to_vmx(vcpu); 2107 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2108 2109 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2110 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2111 2112 if (!vmx->nested.has_preemption_timer_deadline) { 2113 vmx->nested.preemption_timer_deadline = 2114 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2115 vmx->nested.has_preemption_timer_deadline = true; 2116 } 2117 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2118 } 2119 2120 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2121 u64 preemption_timeout) 2122 { 2123 struct vcpu_vmx *vmx = to_vmx(vcpu); 2124 2125 /* 2126 * A timer value of zero is architecturally guaranteed to cause 2127 * a VMExit prior to executing any instructions in the guest. 2128 */ 2129 if (preemption_timeout == 0) { 2130 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2131 return; 2132 } 2133 2134 if (vcpu->arch.virtual_tsc_khz == 0) 2135 return; 2136 2137 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2138 preemption_timeout *= 1000000; 2139 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2140 hrtimer_start(&vmx->nested.preemption_timer, 2141 ktime_add_ns(ktime_get(), preemption_timeout), 2142 HRTIMER_MODE_ABS_PINNED); 2143 } 2144 2145 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2146 { 2147 if (vmx->nested.nested_run_pending && 2148 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2149 return vmcs12->guest_ia32_efer; 2150 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2151 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2152 else 2153 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2154 } 2155 2156 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2157 { 2158 struct kvm *kvm = vmx->vcpu.kvm; 2159 2160 /* 2161 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2162 * according to L0's settings (vmcs12 is irrelevant here). Host 2163 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2164 * will be set as needed prior to VMLAUNCH/VMRESUME. 2165 */ 2166 if (vmx->nested.vmcs02_initialized) 2167 return; 2168 vmx->nested.vmcs02_initialized = true; 2169 2170 /* 2171 * We don't care what the EPTP value is we just need to guarantee 2172 * it's valid so we don't get a false positive when doing early 2173 * consistency checks. 
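 * The dummy EPTP written below (a 4-level walk rooted at PA 0) should
 * never be used for an actual translation; the real EPTP is installed via
 * the MMU before L2 runs.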
2174 */ 2175 if (enable_ept && nested_early_check) 2176 vmcs_write64(EPT_POINTER, 2177 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2178 2179 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2180 if (cpu_has_vmx_vmfunc()) 2181 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2182 2183 if (cpu_has_vmx_posted_intr()) 2184 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2185 2186 if (cpu_has_vmx_msr_bitmap()) 2187 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2188 2189 /* 2190 * PML is emulated for L2, but never enabled in hardware as the MMU 2191 * handles A/D emulation. Disabling PML for L2 also avoids having to 2192 * deal with filtering out L2 GPAs from the buffer. 2193 */ 2194 if (enable_pml) { 2195 vmcs_write64(PML_ADDRESS, 0); 2196 vmcs_write16(GUEST_PML_INDEX, -1); 2197 } 2198 2199 if (cpu_has_vmx_encls_vmexit()) 2200 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2201 2202 if (kvm_notify_vmexit_enabled(kvm)) 2203 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 2204 2205 /* 2206 * Set the MSR load/store lists to match L0's settings. Only the 2207 * addresses are constant (for vmcs02), the counts can change based 2208 * on L2's behavior, e.g. switching to/from long mode. 2209 */ 2210 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2211 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2212 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2213 2214 vmx_set_constant_host_state(vmx); 2215 } 2216 2217 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2218 struct vmcs12 *vmcs12) 2219 { 2220 prepare_vmcs02_constant_state(vmx); 2221 2222 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2223 2224 if (enable_vpid) { 2225 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2226 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2227 else 2228 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2229 } 2230 } 2231 2232 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2233 struct vmcs12 *vmcs12) 2234 { 2235 u32 exec_control; 2236 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2237 2238 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2239 prepare_vmcs02_early_rare(vmx, vmcs12); 2240 2241 /* 2242 * PIN CONTROLS 2243 */ 2244 exec_control = __pin_controls_get(vmcs01); 2245 exec_control |= (vmcs12->pin_based_vm_exec_control & 2246 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2247 2248 /* Posted interrupts setting is only taken from vmcs12. */ 2249 vmx->nested.pi_pending = false; 2250 if (nested_cpu_has_posted_intr(vmcs12)) 2251 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2252 else 2253 exec_control &= ~PIN_BASED_POSTED_INTR; 2254 pin_controls_set(vmx, exec_control); 2255 2256 /* 2257 * EXEC CONTROLS 2258 */ 2259 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2260 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2261 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2262 exec_control &= ~CPU_BASED_TPR_SHADOW; 2263 exec_control |= vmcs12->cpu_based_vm_exec_control; 2264 2265 vmx->nested.l1_tpr_threshold = -1; 2266 if (exec_control & CPU_BASED_TPR_SHADOW) 2267 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2268 #ifdef CONFIG_X86_64 2269 else 2270 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2271 CPU_BASED_CR8_STORE_EXITING; 2272 #endif 2273 2274 /* 2275 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2276 * for I/O port accesses. 
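 * KVM doesn't merge L1's I/O bitmaps into vmcs02: it forces unconditional
 * I/O exiting instead and consults vmcs12's bitmaps in software when
 * deciding whether to reflect the exit to L1.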
2277 */ 2278 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2279 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2280 2281 /* 2282 * This bit will be computed in nested_get_vmcs12_pages, because 2283 * we do not have access to L1's MSR bitmap yet. For now, keep 2284 * the same bit as before, hoping to avoid multiple VMWRITEs that 2285 * only set/clear this bit. 2286 */ 2287 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2288 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2289 2290 exec_controls_set(vmx, exec_control); 2291 2292 /* 2293 * SECONDARY EXEC CONTROLS 2294 */ 2295 if (cpu_has_secondary_exec_ctrls()) { 2296 exec_control = __secondary_exec_controls_get(vmcs01); 2297 2298 /* Take the following fields only from vmcs12 */ 2299 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2300 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2301 SECONDARY_EXEC_ENABLE_INVPCID | 2302 SECONDARY_EXEC_ENABLE_RDTSCP | 2303 SECONDARY_EXEC_XSAVES | 2304 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2305 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2306 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2307 SECONDARY_EXEC_ENABLE_VMFUNC | 2308 SECONDARY_EXEC_DESC); 2309 2310 if (nested_cpu_has(vmcs12, 2311 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2312 exec_control |= vmcs12->secondary_vm_exec_control; 2313 2314 /* PML is emulated and never enabled in hardware for L2. */ 2315 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2316 2317 /* VMCS shadowing for L2 is emulated for now */ 2318 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2319 2320 /* 2321 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2322 * will not have to rewrite the controls just for this bit. 2323 */ 2324 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2325 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2326 exec_control |= SECONDARY_EXEC_DESC; 2327 2328 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2329 vmcs_write16(GUEST_INTR_STATUS, 2330 vmcs12->guest_intr_status); 2331 2332 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2333 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2334 2335 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2336 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2337 2338 secondary_exec_controls_set(vmx, exec_control); 2339 } 2340 2341 /* 2342 * ENTRY CONTROLS 2343 * 2344 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2345 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2346 * on the related bits (if supported by the CPU) in the hope that 2347 * we can avoid VMWrites during vmx_set_efer(). 2348 * 2349 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is 2350 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to 2351 * do the same for L2. 2352 */ 2353 exec_control = __vm_entry_controls_get(vmcs01); 2354 exec_control |= (vmcs12->vm_entry_controls & 2355 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 2356 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2357 if (cpu_has_load_ia32_efer()) { 2358 if (guest_efer & EFER_LMA) 2359 exec_control |= VM_ENTRY_IA32E_MODE; 2360 if (guest_efer != host_efer) 2361 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2362 } 2363 vm_entry_controls_set(vmx, exec_control); 2364 2365 /* 2366 * EXIT CONTROLS 2367 * 2368 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2369 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2370 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
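 * vmcs12's exit controls are honored in software when the nested VM-Exit
 * is synthesized (see load_vmcs12_host_state()).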
2371 */ 2372 exec_control = __vm_exit_controls_get(vmcs01); 2373 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2374 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2375 else 2376 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2377 vm_exit_controls_set(vmx, exec_control); 2378 2379 /* 2380 * Interrupt/Exception Fields 2381 */ 2382 if (vmx->nested.nested_run_pending) { 2383 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2384 vmcs12->vm_entry_intr_info_field); 2385 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2386 vmcs12->vm_entry_exception_error_code); 2387 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2388 vmcs12->vm_entry_instruction_len); 2389 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2390 vmcs12->guest_interruptibility_info); 2391 vmx->loaded_vmcs->nmi_known_unmasked = 2392 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2393 } else { 2394 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2395 } 2396 } 2397 2398 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2399 { 2400 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2401 2402 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2403 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2404 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2405 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2406 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2407 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2408 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2409 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2410 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2411 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2412 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2413 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2414 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2415 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2416 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2417 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2418 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2419 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2420 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2421 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2422 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2423 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2424 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2425 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2426 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2427 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2428 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2429 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2430 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2431 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2432 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2433 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2434 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2435 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2436 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2437 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2438 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2439 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2440 2441 vmx->segment_cache.bitmask = 0; 2442 } 2443 2444 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2445 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 
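		/*
		 * As with the group above, these writes are skipped when the
		 * enlightened VMCS marks GUEST_GRP1 clean, i.e. when L1 hasn't
		 * touched any field in the group since the last VM-Enter.
		 */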
2446 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2447 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2448 vmcs12->guest_pending_dbg_exceptions); 2449 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2450 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2451 2452 /* 2453 * L1 may access the L2's PDPTR, so save them to construct 2454 * vmcs12 2455 */ 2456 if (enable_ept) { 2457 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2458 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2459 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2460 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2461 } 2462 2463 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2464 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2465 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2466 } 2467 2468 if (nested_cpu_has_xsaves(vmcs12)) 2469 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2470 2471 /* 2472 * Whether page-faults are trapped is determined by a combination of 2473 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2474 * doesn't care about page faults then we should set all of these to 2475 * L1's desires. However, if L0 does care about (some) page faults, it 2476 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2477 * simply ask to exit on each and every L2 page fault. This is done by 2478 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2479 * Note that below we don't need special code to set EB.PF beyond the 2480 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2481 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2482 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2483 */ 2484 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2485 /* 2486 * TODO: if both L0 and L1 need the same MASK and MATCH, 2487 * go ahead and use it? 2488 */ 2489 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2490 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2491 } else { 2492 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2493 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2494 } 2495 2496 if (cpu_has_vmx_apicv()) { 2497 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2498 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2499 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2500 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2501 } 2502 2503 /* 2504 * Make sure the msr_autostore list is up to date before we set the 2505 * count in the vmcs02. 2506 */ 2507 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2508 2509 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2510 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2511 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2512 2513 set_cr4_guest_host_mask(vmx); 2514 } 2515 2516 /* 2517 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2518 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2519 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2520 * guest in a way that will both be appropriate to L1's requests, and our 2521 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2522 * function also has additional necessary side-effects, like setting various 2523 * vcpu->arch fields. 2524 * Returns 0 on success, 1 on failure. 
Invalid state exit qualification code 2525 * is assigned to entry_failure_code on failure. 2526 */ 2527 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2528 bool from_vmentry, 2529 enum vm_entry_failure_code *entry_failure_code) 2530 { 2531 struct vcpu_vmx *vmx = to_vmx(vcpu); 2532 bool load_guest_pdptrs_vmcs12 = false; 2533 2534 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 2535 prepare_vmcs02_rare(vmx, vmcs12); 2536 vmx->nested.dirty_vmcs12 = false; 2537 2538 load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) || 2539 !(vmx->nested.hv_evmcs->hv_clean_fields & 2540 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2541 } 2542 2543 if (vmx->nested.nested_run_pending && 2544 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2545 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2546 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2547 } else { 2548 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2549 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); 2550 } 2551 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2552 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2553 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); 2554 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2555 2556 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2557 * bitwise-or of what L1 wants to trap for L2, and what we want to 2558 * trap. Note that CR0.TS also needs updating - we do this later. 2559 */ 2560 vmx_update_exception_bitmap(vcpu); 2561 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2562 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2563 2564 if (vmx->nested.nested_run_pending && 2565 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2566 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2567 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2568 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2569 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2570 } 2571 2572 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 2573 vcpu->arch.l1_tsc_offset, 2574 vmx_get_l2_tsc_offset(vcpu), 2575 vmx_get_l2_tsc_multiplier(vcpu)); 2576 2577 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( 2578 vcpu->arch.l1_tsc_scaling_ratio, 2579 vmx_get_l2_tsc_multiplier(vcpu)); 2580 2581 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2582 if (kvm_caps.has_tsc_control) 2583 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2584 2585 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2586 2587 if (nested_cpu_has_ept(vmcs12)) 2588 nested_ept_init_mmu_context(vcpu); 2589 2590 /* 2591 * Override the CR0/CR4 read shadows after setting the effective guest 2592 * CR0/CR4. The common helpers also set the shadows, but they don't 2593 * account for vmcs12's cr0/4_guest_host_mask. 2594 */ 2595 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2596 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2597 2598 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2599 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2600 2601 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2602 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2603 vmx_set_efer(vcpu, vcpu->arch.efer); 2604 2605 /* 2606 * Guest state is invalid and unrestricted guest is disabled, 2607 * which means L1 attempted VMEntry to L2 with invalid state. 2608 * Fail the VMEntry. 
2609 * 2610 * However when force loading the guest state (SMM exit or 2611 * loading nested state after migration, it is possible to 2612 * have invalid guest state now, which will be later fixed by 2613 * restoring L2 register state 2614 */ 2615 if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { 2616 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2617 return -EINVAL; 2618 } 2619 2620 /* Shadow page tables on either EPT or shadow page tables. */ 2621 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2622 from_vmentry, entry_failure_code)) 2623 return -EINVAL; 2624 2625 /* 2626 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 2627 * on nested VM-Exit, which can occur without actually running L2 and 2628 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with 2629 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the 2630 * transition to HLT instead of running L2. 2631 */ 2632 if (enable_ept) 2633 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2634 2635 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2636 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2637 is_pae_paging(vcpu)) { 2638 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2639 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2640 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2641 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2642 } 2643 2644 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2645 intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && 2646 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2647 vmcs12->guest_ia32_perf_global_ctrl))) { 2648 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2649 return -EINVAL; 2650 } 2651 2652 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2653 kvm_rip_write(vcpu, vmcs12->guest_rip); 2654 2655 /* 2656 * It was observed that genuine Hyper-V running in L1 doesn't reset 2657 * 'hv_clean_fields' by itself, it only sets the corresponding dirty 2658 * bits when it changes a field in eVMCS. Mark all fields as clean 2659 * here. 2660 */ 2661 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2662 vmx->nested.hv_evmcs->hv_clean_fields |= 2663 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2664 2665 return 0; 2666 } 2667 2668 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2669 { 2670 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2671 nested_cpu_has_virtual_nmis(vmcs12))) 2672 return -EINVAL; 2673 2674 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2675 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2676 return -EINVAL; 2677 2678 return 0; 2679 } 2680 2681 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2682 { 2683 struct vcpu_vmx *vmx = to_vmx(vcpu); 2684 2685 /* Check for memory type validity */ 2686 switch (new_eptp & VMX_EPTP_MT_MASK) { 2687 case VMX_EPTP_MT_UC: 2688 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2689 return false; 2690 break; 2691 case VMX_EPTP_MT_WB: 2692 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2693 return false; 2694 break; 2695 default: 2696 return false; 2697 } 2698 2699 /* Page-walk levels validity. 
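 * A 4-level walk covers 48-bit guest-physical addresses and a 5-level walk
 * covers 57 bits; L1 may only use a depth advertised in
 * vmx->nested.msrs.ept_caps.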
*/ 2700 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2701 case VMX_EPTP_PWL_5: 2702 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2703 return false; 2704 break; 2705 case VMX_EPTP_PWL_4: 2706 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2707 return false; 2708 break; 2709 default: 2710 return false; 2711 } 2712 2713 /* Reserved bits should not be set */ 2714 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2715 return false; 2716 2717 /* AD, if set, should be supported */ 2718 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2719 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2720 return false; 2721 } 2722 2723 return true; 2724 } 2725 2726 /* 2727 * Checks related to VM-Execution Control Fields 2728 */ 2729 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2730 struct vmcs12 *vmcs12) 2731 { 2732 struct vcpu_vmx *vmx = to_vmx(vcpu); 2733 2734 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2735 vmx->nested.msrs.pinbased_ctls_low, 2736 vmx->nested.msrs.pinbased_ctls_high)) || 2737 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2738 vmx->nested.msrs.procbased_ctls_low, 2739 vmx->nested.msrs.procbased_ctls_high))) 2740 return -EINVAL; 2741 2742 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2743 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2744 vmx->nested.msrs.secondary_ctls_low, 2745 vmx->nested.msrs.secondary_ctls_high))) 2746 return -EINVAL; 2747 2748 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2749 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2750 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2751 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2752 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2753 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2754 nested_vmx_check_nmi_controls(vmcs12) || 2755 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2756 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2757 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2758 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2759 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2760 return -EINVAL; 2761 2762 if (!nested_cpu_has_preemption_timer(vmcs12) && 2763 nested_cpu_has_save_preemption_timer(vmcs12)) 2764 return -EINVAL; 2765 2766 if (nested_cpu_has_ept(vmcs12) && 2767 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2768 return -EINVAL; 2769 2770 if (nested_cpu_has_vmfunc(vmcs12)) { 2771 if (CC(vmcs12->vm_function_control & 2772 ~vmx->nested.msrs.vmfunc_controls)) 2773 return -EINVAL; 2774 2775 if (nested_cpu_has_eptp_switching(vmcs12)) { 2776 if (CC(!nested_cpu_has_ept(vmcs12)) || 2777 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2778 return -EINVAL; 2779 } 2780 } 2781 2782 return 0; 2783 } 2784 2785 /* 2786 * Checks related to VM-Exit Control Fields 2787 */ 2788 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2789 struct vmcs12 *vmcs12) 2790 { 2791 struct vcpu_vmx *vmx = to_vmx(vcpu); 2792 2793 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2794 vmx->nested.msrs.exit_ctls_low, 2795 vmx->nested.msrs.exit_ctls_high)) || 2796 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2797 return -EINVAL; 2798 2799 return 0; 2800 } 2801 2802 /* 2803 * Checks related to VM-Entry Control Fields 2804 */ 2805 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2806 struct vmcs12 *vmcs12) 2807 { 2808 struct 
vcpu_vmx *vmx = to_vmx(vcpu); 2809 2810 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2811 vmx->nested.msrs.entry_ctls_low, 2812 vmx->nested.msrs.entry_ctls_high))) 2813 return -EINVAL; 2814 2815 /* 2816 * From the Intel SDM, volume 3: 2817 * Fields relevant to VM-entry event injection must be set properly. 2818 * These fields are the VM-entry interruption-information field, the 2819 * VM-entry exception error code, and the VM-entry instruction length. 2820 */ 2821 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2822 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2823 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2824 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2825 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2826 bool should_have_error_code; 2827 bool urg = nested_cpu_has2(vmcs12, 2828 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2829 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2830 2831 /* VM-entry interruption-info field: interruption type */ 2832 if (CC(intr_type == INTR_TYPE_RESERVED) || 2833 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2834 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2835 return -EINVAL; 2836 2837 /* VM-entry interruption-info field: vector */ 2838 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2839 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2840 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2841 return -EINVAL; 2842 2843 /* VM-entry interruption-info field: deliver error code */ 2844 should_have_error_code = 2845 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2846 x86_exception_has_error_code(vector); 2847 if (CC(has_error_code != should_have_error_code)) 2848 return -EINVAL; 2849 2850 /* VM-entry exception error code */ 2851 if (CC(has_error_code && 2852 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2853 return -EINVAL; 2854 2855 /* VM-entry interruption-info field: reserved bits */ 2856 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2857 return -EINVAL; 2858 2859 /* VM-entry instruction length */ 2860 switch (intr_type) { 2861 case INTR_TYPE_SOFT_EXCEPTION: 2862 case INTR_TYPE_SOFT_INTR: 2863 case INTR_TYPE_PRIV_SW_EXCEPTION: 2864 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2865 CC(vmcs12->vm_entry_instruction_len == 0 && 2866 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2867 return -EINVAL; 2868 } 2869 } 2870 2871 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2872 return -EINVAL; 2873 2874 return 0; 2875 } 2876 2877 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2878 struct vmcs12 *vmcs12) 2879 { 2880 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2881 nested_check_vm_exit_controls(vcpu, vmcs12) || 2882 nested_check_vm_entry_controls(vcpu, vmcs12)) 2883 return -EINVAL; 2884 2885 if (guest_cpuid_has_evmcs(vcpu)) 2886 return nested_evmcs_check_controls(vmcs12); 2887 2888 return 0; 2889 } 2890 2891 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 2892 struct vmcs12 *vmcs12) 2893 { 2894 #ifdef CONFIG_X86_64 2895 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 2896 !!(vcpu->arch.efer & EFER_LMA))) 2897 return -EINVAL; 2898 #endif 2899 return 0; 2900 } 2901 2902 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2903 struct vmcs12 *vmcs12) 2904 { 2905 bool ia32e; 2906 2907 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2908 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2909 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) 
2910 return -EINVAL; 2911 2912 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2913 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2914 return -EINVAL; 2915 2916 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2917 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2918 return -EINVAL; 2919 2920 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2921 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2922 vmcs12->host_ia32_perf_global_ctrl))) 2923 return -EINVAL; 2924 2925 #ifdef CONFIG_X86_64 2926 ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 2927 #else 2928 ia32e = false; 2929 #endif 2930 2931 if (ia32e) { 2932 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2933 return -EINVAL; 2934 } else { 2935 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2936 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2937 CC((vmcs12->host_rip) >> 32)) 2938 return -EINVAL; 2939 } 2940 2941 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2942 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2943 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2944 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2945 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2946 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2947 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2948 CC(vmcs12->host_cs_selector == 0) || 2949 CC(vmcs12->host_tr_selector == 0) || 2950 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2951 return -EINVAL; 2952 2953 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2954 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2955 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2956 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2957 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2958 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2959 return -EINVAL; 2960 2961 /* 2962 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2963 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2964 * the values of the LMA and LME bits in the field must each be that of 2965 * the host address-space size VM-exit control. 
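 * E.g. a VM-Exit to a 64-bit L1 host requires the "host address-space
 * size" exit control to be 1 and host_ia32_efer.LMA == LME == 1.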
2966 */ 2967 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2968 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2969 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2970 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2971 return -EINVAL; 2972 } 2973 2974 return 0; 2975 } 2976 2977 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2978 struct vmcs12 *vmcs12) 2979 { 2980 struct vcpu_vmx *vmx = to_vmx(vcpu); 2981 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 2982 struct vmcs_hdr hdr; 2983 2984 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 2985 return 0; 2986 2987 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2988 return -EINVAL; 2989 2990 if (ghc->gpa != vmcs12->vmcs_link_pointer && 2991 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 2992 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 2993 return -EINVAL; 2994 2995 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 2996 offsetof(struct vmcs12, hdr), 2997 sizeof(hdr)))) 2998 return -EINVAL; 2999 3000 if (CC(hdr.revision_id != VMCS12_REVISION) || 3001 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3002 return -EINVAL; 3003 3004 return 0; 3005 } 3006 3007 /* 3008 * Checks related to Guest Non-register State 3009 */ 3010 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3011 { 3012 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3013 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3014 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3015 return -EINVAL; 3016 3017 return 0; 3018 } 3019 3020 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3021 struct vmcs12 *vmcs12, 3022 enum vm_entry_failure_code *entry_failure_code) 3023 { 3024 bool ia32e; 3025 3026 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3027 3028 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3029 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3030 return -EINVAL; 3031 3032 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3033 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 3034 return -EINVAL; 3035 3036 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3037 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3038 return -EINVAL; 3039 3040 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3041 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3042 return -EINVAL; 3043 } 3044 3045 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3046 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3047 vmcs12->guest_ia32_perf_global_ctrl))) 3048 return -EINVAL; 3049 3050 /* 3051 * If the load IA32_EFER VM-entry control is 1, the following checks 3052 * are performed on the field for the IA32_EFER MSR: 3053 * - Bits reserved in the IA32_EFER MSR must be 0. 3054 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3055 * the IA-32e mode guest VM-exit control. It must also be identical 3056 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3057 * CR0.PG) is 1. 
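 * E.g. entering a 64-bit L2 (CR0.PG=1) requires "IA-32e mode guest" = 1
 * and guest_ia32_efer.LMA == LME == 1, while a 32-bit paged L2 requires
 * all three to be 0.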
3058 */ 3059 if (to_vmx(vcpu)->nested.nested_run_pending && 3060 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3061 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 3062 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3063 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3064 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3065 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3066 return -EINVAL; 3067 } 3068 3069 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3070 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3071 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3072 return -EINVAL; 3073 3074 if (nested_check_guest_non_reg_state(vmcs12)) 3075 return -EINVAL; 3076 3077 return 0; 3078 } 3079 3080 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3081 { 3082 struct vcpu_vmx *vmx = to_vmx(vcpu); 3083 unsigned long cr3, cr4; 3084 bool vm_fail; 3085 3086 if (!nested_early_check) 3087 return 0; 3088 3089 if (vmx->msr_autoload.host.nr) 3090 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3091 if (vmx->msr_autoload.guest.nr) 3092 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3093 3094 preempt_disable(); 3095 3096 vmx_prepare_switch_to_guest(vcpu); 3097 3098 /* 3099 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3100 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3101 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3102 * there is no need to preserve other bits or save/restore the field. 3103 */ 3104 vmcs_writel(GUEST_RFLAGS, 0); 3105 3106 cr3 = __get_current_cr3_fast(); 3107 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3108 vmcs_writel(HOST_CR3, cr3); 3109 vmx->loaded_vmcs->host_state.cr3 = cr3; 3110 } 3111 3112 cr4 = cr4_read_shadow(); 3113 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3114 vmcs_writel(HOST_CR4, cr4); 3115 vmx->loaded_vmcs->host_state.cr4 = cr4; 3116 } 3117 3118 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3119 __vmx_vcpu_run_flags(vmx)); 3120 3121 if (vmx->msr_autoload.host.nr) 3122 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3123 if (vmx->msr_autoload.guest.nr) 3124 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3125 3126 if (vm_fail) { 3127 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3128 3129 preempt_enable(); 3130 3131 trace_kvm_nested_vmenter_failed( 3132 "early hardware check VM-instruction error: ", error); 3133 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3134 return 1; 3135 } 3136 3137 /* 3138 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3139 */ 3140 if (hw_breakpoint_active()) 3141 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3142 local_irq_enable(); 3143 preempt_enable(); 3144 3145 /* 3146 * A non-failing VMEntry means we somehow entered guest mode with 3147 * an illegal RIP, and that's just the tip of the iceberg. There 3148 * is no telling what memory has been modified or what state has 3149 * been exposed to unknown code. Hitting this all but guarantees 3150 * a (very critical) hardware issue. 3151 */ 3152 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3153 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3154 3155 return 0; 3156 } 3157 3158 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3159 { 3160 struct vcpu_vmx *vmx = to_vmx(vcpu); 3161 3162 /* 3163 * hv_evmcs may end up being not mapped after migration (when 3164 * L2 was running), map it here to make sure vmcs12 changes are 3165 * properly reflected. 
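 * (After migration with the eVMCS in use, the nested state restore path is
 * expected to leave hv_evmcs_vmptr as EVMPTR_MAP_PENDING, which is what
 * the check below keys off of.)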
3166 */ 3167 if (guest_cpuid_has_evmcs(vcpu) && 3168 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3169 enum nested_evmptrld_status evmptrld_status = 3170 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3171 3172 if (evmptrld_status == EVMPTRLD_VMFAIL || 3173 evmptrld_status == EVMPTRLD_ERROR) 3174 return false; 3175 3176 /* 3177 * Post migration VMCS12 always provides the most actual 3178 * information, copy it to eVMCS upon entry. 3179 */ 3180 vmx->nested.need_vmcs12_to_shadow_sync = true; 3181 } 3182 3183 return true; 3184 } 3185 3186 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3187 { 3188 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3189 struct vcpu_vmx *vmx = to_vmx(vcpu); 3190 struct kvm_host_map *map; 3191 3192 if (!vcpu->arch.pdptrs_from_userspace && 3193 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3194 /* 3195 * Reload the guest's PDPTRs since after a migration 3196 * the guest CR3 might be restored prior to setting the nested 3197 * state which can lead to a load of wrong PDPTRs. 3198 */ 3199 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3200 return false; 3201 } 3202 3203 3204 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3205 map = &vmx->nested.apic_access_page_map; 3206 3207 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3208 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3209 } else { 3210 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3211 __func__); 3212 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3213 vcpu->run->internal.suberror = 3214 KVM_INTERNAL_ERROR_EMULATION; 3215 vcpu->run->internal.ndata = 0; 3216 return false; 3217 } 3218 } 3219 3220 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3221 map = &vmx->nested.virtual_apic_map; 3222 3223 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3224 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3225 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3226 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3227 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3228 /* 3229 * The processor will never use the TPR shadow, simply 3230 * clear the bit from the execution control. Such a 3231 * configuration is useless, but it happens in tests. 3232 * For any other configuration, failing the vm entry is 3233 * _not_ what the processor does but it's basically the 3234 * only possibility we have. 3235 */ 3236 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3237 } else { 3238 /* 3239 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3240 * force VM-Entry to fail. 3241 */ 3242 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3243 } 3244 } 3245 3246 if (nested_cpu_has_posted_intr(vmcs12)) { 3247 map = &vmx->nested.pi_desc_map; 3248 3249 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3250 vmx->nested.pi_desc = 3251 (struct pi_desc *)(((void *)map->hva) + 3252 offset_in_page(vmcs12->posted_intr_desc_addr)); 3253 vmcs_write64(POSTED_INTR_DESC_ADDR, 3254 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3255 } else { 3256 /* 3257 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3258 * access the contents of the VMCS12 posted interrupt 3259 * descriptor. (Note that KVM may do this when it 3260 * should not, per the architectural specification.) 
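 * Posted interrupts are then simply not used for this L2: pi_desc is
 * cleared and PIN_BASED_POSTED_INTR is stripped from vmcs02.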
3261 */ 3262 vmx->nested.pi_desc = NULL; 3263 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3264 } 3265 } 3266 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3267 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3268 else 3269 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3270 3271 return true; 3272 } 3273 3274 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3275 { 3276 /* 3277 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3278 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3279 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3280 * migration. 3281 */ 3282 if (!nested_get_evmcs_page(vcpu)) { 3283 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3284 __func__); 3285 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3286 vcpu->run->internal.suberror = 3287 KVM_INTERNAL_ERROR_EMULATION; 3288 vcpu->run->internal.ndata = 0; 3289 3290 return false; 3291 } 3292 3293 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3294 return false; 3295 3296 return true; 3297 } 3298 3299 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3300 { 3301 struct vmcs12 *vmcs12; 3302 struct vcpu_vmx *vmx = to_vmx(vcpu); 3303 gpa_t dst; 3304 3305 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3306 return 0; 3307 3308 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3309 return 1; 3310 3311 /* 3312 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3313 * set is already checked as part of A/D emulation. 3314 */ 3315 vmcs12 = get_vmcs12(vcpu); 3316 if (!nested_cpu_has_pml(vmcs12)) 3317 return 0; 3318 3319 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3320 vmx->nested.pml_full = true; 3321 return 1; 3322 } 3323 3324 gpa &= ~0xFFFull; 3325 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3326 3327 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3328 offset_in_page(dst), sizeof(gpa))) 3329 return 0; 3330 3331 vmcs12->guest_pml_index--; 3332 3333 return 0; 3334 } 3335 3336 /* 3337 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3338 * for running VMX instructions (except VMXON, whose prerequisites are 3339 * slightly different). It also specifies what exception to inject otherwise. 3340 * Note that many of these exceptions have priority over VM exits, so they 3341 * don't have to be checked again here. 3342 */ 3343 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3344 { 3345 if (!to_vmx(vcpu)->nested.vmxon) { 3346 kvm_queue_exception(vcpu, UD_VECTOR); 3347 return 0; 3348 } 3349 3350 if (vmx_get_cpl(vcpu)) { 3351 kvm_inject_gp(vcpu, 0); 3352 return 0; 3353 } 3354 3355 return 1; 3356 } 3357 3358 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3359 { 3360 u8 rvi = vmx_get_rvi(); 3361 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3362 3363 return ((rvi & 0xf0) > (vppr & 0xf0)); 3364 } 3365 3366 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3367 struct vmcs12 *vmcs12); 3368 3369 /* 3370 * If from_vmentry is false, this is being called from state restore (either RSM 3371 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
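 *
 * A rough sketch of the from_vmentry == true caller, nested_vmx_run():
 *
 *	status = nested_vmx_enter_non_root_mode(vcpu, true);
 *	if (status != NVMX_VMENTRY_SUCCESS)
 *		<unwind: internal error, VMFail, or synthesized VM-Exit>
 *
 * The state-restore callers pass false and defer guest-memory accesses by
 * requesting KVM_REQ_GET_NESTED_STATE_PAGES instead (see below).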
3372 * 3373 * Returns: 3374 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3375 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3376 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3377 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3378 */ 3379 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3380 bool from_vmentry) 3381 { 3382 struct vcpu_vmx *vmx = to_vmx(vcpu); 3383 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3384 enum vm_entry_failure_code entry_failure_code; 3385 bool evaluate_pending_interrupts; 3386 union vmx_exit_reason exit_reason = { 3387 .basic = EXIT_REASON_INVALID_STATE, 3388 .failed_vmentry = 1, 3389 }; 3390 u32 failed_index; 3391 3392 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3393 vmx->nested.current_vmptr, 3394 vmcs12->guest_rip, 3395 vmcs12->guest_intr_status, 3396 vmcs12->vm_entry_intr_info_field, 3397 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3398 vmcs12->ept_pointer, 3399 vmcs12->guest_cr3, 3400 KVM_ISA_VMX); 3401 3402 kvm_service_local_tlb_flush_requests(vcpu); 3403 3404 evaluate_pending_interrupts = exec_controls_get(vmx) & 3405 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3406 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3407 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3408 if (!evaluate_pending_interrupts) 3409 evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); 3410 3411 if (!vmx->nested.nested_run_pending || 3412 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3413 vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3414 if (kvm_mpx_supported() && 3415 (!vmx->nested.nested_run_pending || 3416 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3417 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3418 3419 /* 3420 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3421 * nested early checks are disabled. In the event of a "late" VM-Fail, 3422 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3423 * software model to the pre-VMEntry host state. When EPT is disabled, 3424 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3425 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3426 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3427 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3428 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3429 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3430 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3431 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3432 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3433 * path would need to manually save/restore vmcs01.GUEST_CR3. 
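	 *
	 * Concretely, the late VM-Fail unwind in
	 * nested_vmx_restore_host_state() does
	 * vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); stuffing L1's CR3 here is
	 * what keeps that read coherent when EPT is disabled.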
3434 */ 3435 if (!enable_ept && !nested_early_check) 3436 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3437 3438 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3439 3440 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3441 3442 if (from_vmentry) { 3443 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3444 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3445 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3446 } 3447 3448 if (nested_vmx_check_vmentry_hw(vcpu)) { 3449 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3450 return NVMX_VMENTRY_VMFAIL; 3451 } 3452 3453 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3454 &entry_failure_code)) { 3455 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3456 vmcs12->exit_qualification = entry_failure_code; 3457 goto vmentry_fail_vmexit; 3458 } 3459 } 3460 3461 enter_guest_mode(vcpu); 3462 3463 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3464 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3465 vmcs12->exit_qualification = entry_failure_code; 3466 goto vmentry_fail_vmexit_guest_mode; 3467 } 3468 3469 if (from_vmentry) { 3470 failed_index = nested_vmx_load_msr(vcpu, 3471 vmcs12->vm_entry_msr_load_addr, 3472 vmcs12->vm_entry_msr_load_count); 3473 if (failed_index) { 3474 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3475 vmcs12->exit_qualification = failed_index; 3476 goto vmentry_fail_vmexit_guest_mode; 3477 } 3478 } else { 3479 /* 3480 * The MMU is not initialized to point at the right entities yet and 3481 * "get pages" would need to read data from the guest (i.e. we will 3482 * need to perform gpa to hpa translation). Request a call 3483 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3484 * have already been set at vmentry time and should not be reset. 3485 */ 3486 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3487 } 3488 3489 /* 3490 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3491 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3492 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3493 * unconditionally. 3494 */ 3495 if (unlikely(evaluate_pending_interrupts)) 3496 kvm_make_request(KVM_REQ_EVENT, vcpu); 3497 3498 /* 3499 * Do not start the preemption timer hrtimer until after we know 3500 * we are successful, so that only nested_vmx_vmexit needs to cancel 3501 * the timer. 3502 */ 3503 vmx->nested.preemption_timer_expired = false; 3504 if (nested_cpu_has_preemption_timer(vmcs12)) { 3505 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3506 vmx_start_preemption_timer(vcpu, timer_value); 3507 } 3508 3509 /* 3510 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3511 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3512 * returned as far as L1 is concerned. It will only return (and set 3513 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3514 */ 3515 return NVMX_VMENTRY_SUCCESS; 3516 3517 /* 3518 * A failed consistency check that leads to a VMExit during L1's 3519 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3520 * 26.7 "VM-entry failures during or after loading guest state". 
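	 *
	 * The two labels below unwind in reverse order of the setup above:
	 * vmentry_fail_vmexit_guest_mode undoes the TSC offsetting and calls
	 * leave_guest_mode(), then vmentry_fail_vmexit switches back to
	 * vmcs01 and, for a real VMLAUNCH/VMRESUME, synthesizes the VM-Exit
	 * via load_vmcs12_host_state().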
3521 */ 3522 vmentry_fail_vmexit_guest_mode: 3523 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3524 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3525 leave_guest_mode(vcpu); 3526 3527 vmentry_fail_vmexit: 3528 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3529 3530 if (!from_vmentry) 3531 return NVMX_VMENTRY_VMEXIT; 3532 3533 load_vmcs12_host_state(vcpu, vmcs12); 3534 vmcs12->vm_exit_reason = exit_reason.full; 3535 if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 3536 vmx->nested.need_vmcs12_to_shadow_sync = true; 3537 return NVMX_VMENTRY_VMEXIT; 3538 } 3539 3540 /* 3541 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3542 * for running an L2 nested guest. 3543 */ 3544 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3545 { 3546 struct vmcs12 *vmcs12; 3547 enum nvmx_vmentry_status status; 3548 struct vcpu_vmx *vmx = to_vmx(vcpu); 3549 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3550 enum nested_evmptrld_status evmptrld_status; 3551 3552 if (!nested_vmx_check_permission(vcpu)) 3553 return 1; 3554 3555 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3556 if (evmptrld_status == EVMPTRLD_ERROR) { 3557 kvm_queue_exception(vcpu, UD_VECTOR); 3558 return 1; 3559 } 3560 3561 kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 3562 3563 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3564 return nested_vmx_failInvalid(vcpu); 3565 3566 if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && 3567 vmx->nested.current_vmptr == INVALID_GPA)) 3568 return nested_vmx_failInvalid(vcpu); 3569 3570 vmcs12 = get_vmcs12(vcpu); 3571 3572 /* 3573 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3574 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3575 * rather than RFLAGS.ZF, and no error number is stored to the 3576 * VM-instruction error field. 3577 */ 3578 if (CC(vmcs12->hdr.shadow_vmcs)) 3579 return nested_vmx_failInvalid(vcpu); 3580 3581 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 3582 copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); 3583 /* Enlightened VMCS doesn't have launch state */ 3584 vmcs12->launch_state = !launch; 3585 } else if (enable_shadow_vmcs) { 3586 copy_shadow_to_vmcs12(vmx); 3587 } 3588 3589 /* 3590 * The nested entry process starts with enforcing various prerequisites 3591 * on vmcs12 as required by the Intel SDM, and act appropriately when 3592 * they fail: As the SDM explains, some conditions should cause the 3593 * instruction to fail, while others will cause the instruction to seem 3594 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3595 * To speed up the normal (success) code path, we should avoid checking 3596 * for misconfigurations which will anyway be caught by the processor 3597 * when using the merged vmcs02. 3598 */ 3599 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3600 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3601 3602 if (CC(vmcs12->launch_state == launch)) 3603 return nested_vmx_fail(vcpu, 3604 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3605 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3606 3607 if (nested_vmx_check_controls(vcpu, vmcs12)) 3608 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3609 3610 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3611 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3612 3613 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3614 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3615 3616 /* 3617 * We're finally done with prerequisite checking, and can start with 3618 * the nested entry. 3619 */ 3620 vmx->nested.nested_run_pending = 1; 3621 vmx->nested.has_preemption_timer_deadline = false; 3622 status = nested_vmx_enter_non_root_mode(vcpu, true); 3623 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3624 goto vmentry_failed; 3625 3626 /* Emulate processing of posted interrupts on VM-Enter. */ 3627 if (nested_cpu_has_posted_intr(vmcs12) && 3628 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3629 vmx->nested.pi_pending = true; 3630 kvm_make_request(KVM_REQ_EVENT, vcpu); 3631 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3632 } 3633 3634 /* Hide L1D cache contents from the nested guest. */ 3635 vmx->vcpu.arch.l1tf_flush_l1d = true; 3636 3637 /* 3638 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3639 * also be used as part of restoring nVMX state for 3640 * snapshot restore (migration). 3641 * 3642 * In this flow, it is assumed that vmcs12 cache was 3643 * transferred as part of captured nVMX state and should 3644 * therefore not be read from guest memory (which may not 3645 * exist on destination host yet). 3646 */ 3647 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3648 3649 switch (vmcs12->guest_activity_state) { 3650 case GUEST_ACTIVITY_HLT: 3651 /* 3652 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3653 * awakened by event injection or by an NMI-window VM-exit or 3654 * by an interrupt-window VM-exit, halt the vcpu. 3655 */ 3656 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3657 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3658 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3659 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3660 vmx->nested.nested_run_pending = 0; 3661 return kvm_emulate_halt_noskip(vcpu); 3662 } 3663 break; 3664 case GUEST_ACTIVITY_WAIT_SIPI: 3665 vmx->nested.nested_run_pending = 0; 3666 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3667 break; 3668 default: 3669 break; 3670 } 3671 3672 return 1; 3673 3674 vmentry_failed: 3675 vmx->nested.nested_run_pending = 0; 3676 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3677 return 0; 3678 if (status == NVMX_VMENTRY_VMEXIT) 3679 return 1; 3680 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3681 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3682 } 3683 3684 /* 3685 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3686 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3687 * This function returns the new value we should put in vmcs12.guest_cr0. 3688 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3689 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3690 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3691 * didn't trap the bit, because if L1 did, so would L0). 3692 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3693 * been modified by L2, and L1 knows it. 
So just leave the old value of 3694 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3695 * isn't relevant, because if L0 traps this bit it can set it to anything. 3696 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3697 * changed these bits, and therefore they need to be updated, but L0 3698 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3699 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3700 */ 3701 static inline unsigned long 3702 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3703 { 3704 return 3705 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3706 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3707 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3708 vcpu->arch.cr0_guest_owned_bits)); 3709 } 3710 3711 static inline unsigned long 3712 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3713 { 3714 return 3715 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3716 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3717 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3718 vcpu->arch.cr4_guest_owned_bits)); 3719 } 3720 3721 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3722 struct vmcs12 *vmcs12, 3723 u32 vm_exit_reason, u32 exit_intr_info) 3724 { 3725 u32 idt_vectoring; 3726 unsigned int nr; 3727 3728 /* 3729 * Per the SDM, VM-Exits due to double and triple faults are never 3730 * considered to occur during event delivery, even if the double/triple 3731 * fault is the result of an escalating vectoring issue. 3732 * 3733 * Note, the SDM qualifies the double fault behavior with "The original 3734 * event results in a double-fault exception". It's unclear why the 3735 * qualification exists since exits due to double fault can occur only 3736 * while vectoring a different exception (injected events are never 3737 * subject to interception), i.e. there's _always_ an original event. 3738 * 3739 * The SDM also uses NMI as a confusing example for the "original event 3740 * causes the VM exit directly" clause. NMI isn't special in any way, 3741 * the same rule applies to all events that cause an exit directly. 3742 * NMI is an odd choice for the example because NMIs can only occur on 3743 * instruction boundaries, i.e. they _can't_ occur during vectoring. 
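	 *
	 * As for the encoding itself, for example a hardware #PF (vector 14)
	 * with an error code that was being injected when the VM-Exit
	 * occurred ends up recorded below roughly as:
	 *
	 *	idt_vectoring_info_field = 14 | INTR_TYPE_HARD_EXCEPTION |
	 *				   VECTORING_INFO_DELIVER_CODE_MASK |
	 *				   VECTORING_INFO_VALID_MASK;
	 *	idt_vectoring_error_code = <the queued error code>;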
3744 */ 3745 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3746 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3747 is_double_fault(exit_intr_info))) { 3748 vmcs12->idt_vectoring_info_field = 0; 3749 } else if (vcpu->arch.exception.injected) { 3750 nr = vcpu->arch.exception.vector; 3751 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3752 3753 if (kvm_exception_is_soft(nr)) { 3754 vmcs12->vm_exit_instruction_len = 3755 vcpu->arch.event_exit_inst_len; 3756 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3757 } else 3758 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3759 3760 if (vcpu->arch.exception.has_error_code) { 3761 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3762 vmcs12->idt_vectoring_error_code = 3763 vcpu->arch.exception.error_code; 3764 } 3765 3766 vmcs12->idt_vectoring_info_field = idt_vectoring; 3767 } else if (vcpu->arch.nmi_injected) { 3768 vmcs12->idt_vectoring_info_field = 3769 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3770 } else if (vcpu->arch.interrupt.injected) { 3771 nr = vcpu->arch.interrupt.nr; 3772 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3773 3774 if (vcpu->arch.interrupt.soft) { 3775 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3776 vmcs12->vm_entry_instruction_len = 3777 vcpu->arch.event_exit_inst_len; 3778 } else 3779 idt_vectoring |= INTR_TYPE_EXT_INTR; 3780 3781 vmcs12->idt_vectoring_info_field = idt_vectoring; 3782 } else { 3783 vmcs12->idt_vectoring_info_field = 0; 3784 } 3785 } 3786 3787 3788 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3789 { 3790 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3791 gfn_t gfn; 3792 3793 /* 3794 * Don't need to mark the APIC access page dirty; it is never 3795 * written to by the CPU during APIC virtualization. 3796 */ 3797 3798 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3799 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3800 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3801 } 3802 3803 if (nested_cpu_has_posted_intr(vmcs12)) { 3804 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3805 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3806 } 3807 } 3808 3809 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3810 { 3811 struct vcpu_vmx *vmx = to_vmx(vcpu); 3812 int max_irr; 3813 void *vapic_page; 3814 u16 status; 3815 3816 if (!vmx->nested.pi_pending) 3817 return 0; 3818 3819 if (!vmx->nested.pi_desc) 3820 goto mmio_needed; 3821 3822 vmx->nested.pi_pending = false; 3823 3824 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3825 return 0; 3826 3827 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3828 if (max_irr != 256) { 3829 vapic_page = vmx->nested.virtual_apic_map.hva; 3830 if (!vapic_page) 3831 goto mmio_needed; 3832 3833 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3834 vapic_page, &max_irr); 3835 status = vmcs_read16(GUEST_INTR_STATUS); 3836 if ((u8)max_irr > ((u8)status & 0xff)) { 3837 status &= ~0xff; 3838 status |= (u8)max_irr; 3839 vmcs_write16(GUEST_INTR_STATUS, status); 3840 } 3841 } 3842 3843 nested_mark_vmcs12_pages_dirty(vcpu); 3844 return 0; 3845 3846 mmio_needed: 3847 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3848 return -ENXIO; 3849 } 3850 3851 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 3852 { 3853 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 3854 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 3855 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3856 unsigned long exit_qual; 3857 3858 if (ex->has_payload) { 3859 exit_qual = ex->payload; 3860 } else if (ex->vector == 
PF_VECTOR) { 3861 exit_qual = vcpu->arch.cr2; 3862 } else if (ex->vector == DB_VECTOR) { 3863 exit_qual = vcpu->arch.dr6; 3864 exit_qual &= ~DR6_BT; 3865 exit_qual ^= DR6_ACTIVE_LOW; 3866 } else { 3867 exit_qual = 0; 3868 } 3869 3870 if (ex->has_error_code) { 3871 /* 3872 * Intel CPUs do not generate error codes with bits 31:16 set, 3873 * and more importantly VMX disallows setting bits 31:16 in the 3874 * injected error code for VM-Entry. Drop the bits to mimic 3875 * hardware and avoid inducing failure on nested VM-Entry if L1 3876 * chooses to inject the exception back to L2. AMD CPUs _do_ 3877 * generate "full" 32-bit error codes, so KVM allows userspace 3878 * to inject exception error codes with bits 31:16 set. 3879 */ 3880 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 3881 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3882 } 3883 3884 if (kvm_exception_is_soft(ex->vector)) 3885 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3886 else 3887 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3888 3889 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3890 vmx_get_nmi_mask(vcpu)) 3891 intr_info |= INTR_INFO_UNBLOCK_NMI; 3892 3893 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3894 } 3895 3896 /* 3897 * Returns true if a debug trap is (likely) pending delivery. Infer the class 3898 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). 3899 * Using the payload is flawed because code breakpoints (fault-like) and data 3900 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 3901 * this will return false positives if a to-be-injected code breakpoint #DB is 3902 * pending (from KVM's perspective, but not "pending" across an instruction 3903 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 3904 * too is trap-like. 3905 * 3906 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 3907 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 3908 * #DB has already happened), and MTF isn't marked pending on code breakpoints 3909 * from the emulator (because such #DBs are fault-like and thus don't trigger 3910 * actions that fire on instruction retire). 3911 */ 3912 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 3913 { 3914 if (!ex->pending || ex->vector != DB_VECTOR) 3915 return 0; 3916 3917 /* General Detect #DBs are always fault-like. */ 3918 return ex->payload & ~DR6_BD; 3919 } 3920 3921 /* 3922 * Returns true if there's a pending #DB exception that is lower priority than 3923 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 3924 * KVM, but could theoretically be injected by userspace. Note, this code is 3925 * imperfect, see above. 3926 */ 3927 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 3928 { 3929 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 3930 } 3931 3932 /* 3933 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 3934 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 3935 * represents these debug traps with a payload that is said to be compatible 3936 * with the 'pending debug exceptions' field, write the payload to the VMCS 3937 * field if a VM-exit is delivered before the debug trap. 
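 * For example, a still-pending single-step or data-breakpoint #DB is
 * propagated into GUEST_PENDING_DBG_EXCEPTIONS before the INIT and Monitor
 * Trap Flag VM-Exits synthesized in vmx_check_nested_events(), which calls
 * nested_vmx_update_pending_dbg() ahead of each of them.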
3938 */ 3939 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 3940 { 3941 unsigned long pending_dbg; 3942 3943 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 3944 if (pending_dbg) 3945 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 3946 } 3947 3948 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 3949 { 3950 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3951 to_vmx(vcpu)->nested.preemption_timer_expired; 3952 } 3953 3954 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu) 3955 { 3956 return nested_vmx_preemption_timer_pending(vcpu) || 3957 to_vmx(vcpu)->nested.mtf_pending; 3958 } 3959 3960 /* 3961 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 3962 * edits to fill in missing examples, e.g. #DB due to split-lock accesses, 3963 * and less minor edits to splice in the priority of VMX Non-Root specific 3964 * events, e.g. MTF and NMI/INTR-window exiting. 3965 * 3966 * 1 Hardware Reset and Machine Checks 3967 * - RESET 3968 * - Machine Check 3969 * 3970 * 2 Trap on Task Switch 3971 * - T flag in TSS is set (on task switch) 3972 * 3973 * 3 External Hardware Interventions 3974 * - FLUSH 3975 * - STOPCLK 3976 * - SMI 3977 * - INIT 3978 * 3979 * 3.5 Monitor Trap Flag (MTF) VM-exit[1] 3980 * 3981 * 4 Traps on Previous Instruction 3982 * - Breakpoints 3983 * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O 3984 * breakpoint, or #DB due to a split-lock access) 3985 * 3986 * 4.3 VMX-preemption timer expired VM-exit 3987 * 3988 * 4.6 NMI-window exiting VM-exit[2] 3989 * 3990 * 5 Nonmaskable Interrupts (NMI) 3991 * 3992 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery 3993 * 3994 * 6 Maskable Hardware Interrupts 3995 * 3996 * 7 Code Breakpoint Fault 3997 * 3998 * 8 Faults from Fetching Next Instruction 3999 * - Code-Segment Limit Violation 4000 * - Code Page Fault 4001 * - Control protection exception (missing ENDBRANCH at target of indirect 4002 * call or jump) 4003 * 4004 * 9 Faults from Decoding Next Instruction 4005 * - Instruction length > 15 bytes 4006 * - Invalid Opcode 4007 * - Coprocessor Not Available 4008 * 4009 *10 Faults on Executing Instruction 4010 * - Overflow 4011 * - Bound error 4012 * - Invalid TSS 4013 * - Segment Not Present 4014 * - Stack fault 4015 * - General Protection 4016 * - Data Page Fault 4017 * - Alignment Check 4018 * - x86 FPU Floating-point exception 4019 * - SIMD floating-point exception 4020 * - Virtualization exception 4021 * - Control protection exception 4022 * 4023 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), 4024 * INIT signals, and higher priority events take priority over MTF VM exits. 4025 * MTF VM exits take priority over debug-trap exceptions and lower priority 4026 * events. 4027 * 4028 * [2] Debug-trap exceptions and higher priority events take priority over VM exits 4029 * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption 4030 * timer take priority over VM exits caused by the "NMI-window exiting" 4031 * VM-execution control and lower priority events. 4032 * 4033 * [3] Debug-trap exceptions and higher priority events take priority over VM exits 4034 * caused by "NMI-window exiting". VM exits caused by this control take 4035 * priority over non-maskable interrupts (NMIs) and lower priority events. 
4036 * 4037 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to 4038 * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, 4039 * non-maskable interrupts (NMIs) and higher priority events take priority over 4040 * delivery of a virtual interrupt; delivery of a virtual interrupt takes 4041 * priority over external interrupts and lower priority events. 4042 */ 4043 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 4044 { 4045 struct kvm_lapic *apic = vcpu->arch.apic; 4046 struct vcpu_vmx *vmx = to_vmx(vcpu); 4047 /* 4048 * Only a pending nested run blocks a pending exception. If there is a 4049 * previously injected event, the pending exception occurred while said 4050 * event was being delivered and thus needs to be handled. 4051 */ 4052 bool block_nested_exceptions = vmx->nested.nested_run_pending; 4053 /* 4054 * New events (not exceptions) are only recognized at instruction 4055 * boundaries. If an event needs reinjection, then KVM is handling a 4056 * VM-Exit that occurred _during_ instruction execution; new events are 4057 * blocked until the instruction completes. 4058 */ 4059 bool block_nested_events = block_nested_exceptions || 4060 kvm_event_needs_reinjection(vcpu); 4061 4062 if (lapic_in_kernel(vcpu) && 4063 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 4064 if (block_nested_events) 4065 return -EBUSY; 4066 nested_vmx_update_pending_dbg(vcpu); 4067 clear_bit(KVM_APIC_INIT, &apic->pending_events); 4068 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 4069 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 4070 4071 /* MTF is discarded if the vCPU is in WFS. */ 4072 vmx->nested.mtf_pending = false; 4073 return 0; 4074 } 4075 4076 if (lapic_in_kernel(vcpu) && 4077 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 4078 if (block_nested_events) 4079 return -EBUSY; 4080 4081 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 4082 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 4083 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 4084 apic->sipi_vector & 0xFFUL); 4085 return 0; 4086 } 4087 /* Fallthrough, the SIPI is completely ignored. */ 4088 } 4089 4090 /* 4091 * Process exceptions that are higher priority than Monitor Trap Flag: 4092 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but 4093 * could theoretically come in from userspace), and ICEBP (INT1). 4094 * 4095 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except 4096 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF 4097 * across SMI/RSM as it should; that needs to be addressed in order to 4098 * prioritize SMI over MTF and trap-like #DBs. 
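	 *
	 * As written, the pending-SMI check in this function sits below the
	 * MTF and exception checks, i.e. SMI is effectively treated as lower
	 * priority than both.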
4099 */ 4100 if (vcpu->arch.exception_vmexit.pending && 4101 !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { 4102 if (block_nested_exceptions) 4103 return -EBUSY; 4104 4105 nested_vmx_inject_exception_vmexit(vcpu); 4106 return 0; 4107 } 4108 4109 if (vcpu->arch.exception.pending && 4110 !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { 4111 if (block_nested_exceptions) 4112 return -EBUSY; 4113 goto no_vmexit; 4114 } 4115 4116 if (vmx->nested.mtf_pending) { 4117 if (block_nested_events) 4118 return -EBUSY; 4119 nested_vmx_update_pending_dbg(vcpu); 4120 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 4121 return 0; 4122 } 4123 4124 if (vcpu->arch.exception_vmexit.pending) { 4125 if (block_nested_exceptions) 4126 return -EBUSY; 4127 4128 nested_vmx_inject_exception_vmexit(vcpu); 4129 return 0; 4130 } 4131 4132 if (vcpu->arch.exception.pending) { 4133 if (block_nested_exceptions) 4134 return -EBUSY; 4135 goto no_vmexit; 4136 } 4137 4138 if (nested_vmx_preemption_timer_pending(vcpu)) { 4139 if (block_nested_events) 4140 return -EBUSY; 4141 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 4142 return 0; 4143 } 4144 4145 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 4146 if (block_nested_events) 4147 return -EBUSY; 4148 goto no_vmexit; 4149 } 4150 4151 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 4152 if (block_nested_events) 4153 return -EBUSY; 4154 if (!nested_exit_on_nmi(vcpu)) 4155 goto no_vmexit; 4156 4157 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 4158 NMI_VECTOR | INTR_TYPE_NMI_INTR | 4159 INTR_INFO_VALID_MASK, 0); 4160 /* 4161 * The NMI-triggered VM exit counts as injection: 4162 * clear this one and block further NMIs. 4163 */ 4164 vcpu->arch.nmi_pending = 0; 4165 vmx_set_nmi_mask(vcpu, true); 4166 return 0; 4167 } 4168 4169 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4170 if (block_nested_events) 4171 return -EBUSY; 4172 if (!nested_exit_on_intr(vcpu)) 4173 goto no_vmexit; 4174 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4175 return 0; 4176 } 4177 4178 no_vmexit: 4179 return vmx_complete_nested_posted_interrupt(vcpu); 4180 } 4181 4182 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 4183 { 4184 ktime_t remaining = 4185 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4186 u64 value; 4187 4188 if (ktime_to_ns(remaining) <= 0) 4189 return 0; 4190 4191 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4192 do_div(value, 1000000); 4193 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4194 } 4195 4196 static bool is_vmcs12_ext_field(unsigned long field) 4197 { 4198 switch (field) { 4199 case GUEST_ES_SELECTOR: 4200 case GUEST_CS_SELECTOR: 4201 case GUEST_SS_SELECTOR: 4202 case GUEST_DS_SELECTOR: 4203 case GUEST_FS_SELECTOR: 4204 case GUEST_GS_SELECTOR: 4205 case GUEST_LDTR_SELECTOR: 4206 case GUEST_TR_SELECTOR: 4207 case GUEST_ES_LIMIT: 4208 case GUEST_CS_LIMIT: 4209 case GUEST_SS_LIMIT: 4210 case GUEST_DS_LIMIT: 4211 case GUEST_FS_LIMIT: 4212 case GUEST_GS_LIMIT: 4213 case GUEST_LDTR_LIMIT: 4214 case GUEST_TR_LIMIT: 4215 case GUEST_GDTR_LIMIT: 4216 case GUEST_IDTR_LIMIT: 4217 case GUEST_ES_AR_BYTES: 4218 case GUEST_DS_AR_BYTES: 4219 case GUEST_FS_AR_BYTES: 4220 case GUEST_GS_AR_BYTES: 4221 case GUEST_LDTR_AR_BYTES: 4222 case GUEST_TR_AR_BYTES: 4223 case GUEST_ES_BASE: 4224 case GUEST_CS_BASE: 4225 case GUEST_SS_BASE: 4226 case GUEST_DS_BASE: 4227 case GUEST_FS_BASE: 4228 case GUEST_GS_BASE: 4229 case GUEST_LDTR_BASE: 4230 case 
GUEST_TR_BASE: 4231 case GUEST_GDTR_BASE: 4232 case GUEST_IDTR_BASE: 4233 case GUEST_PENDING_DBG_EXCEPTIONS: 4234 case GUEST_BNDCFGS: 4235 return true; 4236 default: 4237 break; 4238 } 4239 4240 return false; 4241 } 4242 4243 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4244 struct vmcs12 *vmcs12) 4245 { 4246 struct vcpu_vmx *vmx = to_vmx(vcpu); 4247 4248 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4249 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4250 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4251 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4252 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4253 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4254 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4255 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4256 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4257 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4258 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4259 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4260 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4261 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4262 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4263 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4264 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4265 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4266 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4267 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4268 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4269 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4270 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4271 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4272 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4273 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4274 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4275 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4276 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4277 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4278 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4279 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4280 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4281 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4282 vmcs12->guest_pending_dbg_exceptions = 4283 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4284 4285 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4286 } 4287 4288 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4289 struct vmcs12 *vmcs12) 4290 { 4291 struct vcpu_vmx *vmx = to_vmx(vcpu); 4292 int cpu; 4293 4294 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4295 return; 4296 4297 4298 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4299 4300 cpu = get_cpu(); 4301 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4302 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4303 4304 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4305 4306 vmx->loaded_vmcs = &vmx->vmcs01; 4307 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4308 put_cpu(); 4309 } 4310 4311 /* 4312 * Update the guest state fields of vmcs12 to reflect changes that 4313 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4314 * VM-entry controls is also updated, since this is really a guest 4315 * state bit.) 
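 * The IA-32e bit is merged back near the end of this function via a
 * vm_entry_controls update that masks VM_ENTRY_IA32E_MODE and leaves all
 * other control bits untouched.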
4316 */ 4317 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4318 { 4319 struct vcpu_vmx *vmx = to_vmx(vcpu); 4320 4321 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 4322 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4323 4324 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4325 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); 4326 4327 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4328 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4329 4330 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4331 vmcs12->guest_rip = kvm_rip_read(vcpu); 4332 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4333 4334 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4335 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4336 4337 vmcs12->guest_interruptibility_info = 4338 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4339 4340 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4341 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4342 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4343 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4344 else 4345 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4346 4347 if (nested_cpu_has_preemption_timer(vmcs12) && 4348 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4349 !vmx->nested.nested_run_pending) 4350 vmcs12->vmx_preemption_timer_value = 4351 vmx_get_preemption_timer_value(vcpu); 4352 4353 /* 4354 * In some cases (usually, nested EPT), L2 is allowed to change its 4355 * own CR3 without exiting. If it has changed it, we must keep it. 4356 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4357 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4358 * 4359 * Additionally, restore L2's PDPTR to vmcs12. 4360 */ 4361 if (enable_ept) { 4362 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4363 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4364 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4365 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4366 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4367 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4368 } 4369 } 4370 4371 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4372 4373 if (nested_cpu_has_vid(vmcs12)) 4374 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4375 4376 vmcs12->vm_entry_controls = 4377 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4378 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4379 4380 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4381 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 4382 4383 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4384 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4385 } 4386 4387 /* 4388 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4389 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4390 * and this function updates it to reflect the changes to the guest state while 4391 * L2 was running (and perhaps made some exits which were handled directly by L0 4392 * without going back to L1), and to reflect the exit reason. 4393 * Note that we do not have to copy here all VMCS fields, just those that 4394 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4395 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4396 * which already writes to vmcs12 directly. 
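 * Also note that on a VM-Exit due to a failed VM-Entry, handled below, only
 * the exit reason and exit qualification are updated and launch_state is
 * left untouched.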
4397 */ 4398 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4399 u32 vm_exit_reason, u32 exit_intr_info, 4400 unsigned long exit_qualification) 4401 { 4402 /* update exit information fields: */ 4403 vmcs12->vm_exit_reason = vm_exit_reason; 4404 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4405 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4406 vmcs12->exit_qualification = exit_qualification; 4407 4408 /* 4409 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4410 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4411 * exit info fields are unmodified. 4412 */ 4413 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4414 vmcs12->launch_state = 1; 4415 4416 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4417 * instead of reading the real value. */ 4418 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4419 4420 /* 4421 * Transfer the event that L0 or L1 may have wanted to inject into 4422 * L2 to IDT_VECTORING_INFO_FIELD. 4423 */ 4424 vmcs12_save_pending_event(vcpu, vmcs12, 4425 vm_exit_reason, exit_intr_info); 4426 4427 vmcs12->vm_exit_intr_info = exit_intr_info; 4428 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4429 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4430 4431 /* 4432 * According to spec, there's no need to store the guest's 4433 * MSRs if the exit is due to a VM-entry failure that occurs 4434 * during or after loading the guest state. Since this exit 4435 * does not fall in that category, we need to save the MSRs. 4436 */ 4437 if (nested_vmx_store_msr(vcpu, 4438 vmcs12->vm_exit_msr_store_addr, 4439 vmcs12->vm_exit_msr_store_count)) 4440 nested_vmx_abort(vcpu, 4441 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4442 } 4443 } 4444 4445 /* 4446 * A part of what we need to do when the nested L2 guest exits and we want to 4447 * run its L1 parent, is to reset L1's guest state to the host state specified 4448 * in vmcs12. 4449 * This function is to be called not only on normal nested exit, but also on 4450 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4451 * Failures During or After Loading Guest State"). 4452 * This function should be called when the active VMCS is L1's (vmcs01). 4453 */ 4454 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4455 struct vmcs12 *vmcs12) 4456 { 4457 enum vm_entry_failure_code ignored; 4458 struct kvm_segment seg; 4459 4460 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4461 vcpu->arch.efer = vmcs12->host_ia32_efer; 4462 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4463 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4464 else 4465 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4466 vmx_set_efer(vcpu, vcpu->arch.efer); 4467 4468 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4469 kvm_rip_write(vcpu, vmcs12->host_rip); 4470 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4471 vmx_set_interrupt_shadow(vcpu, 0); 4472 4473 /* 4474 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4475 * actually changed, because vmx_set_cr0 refers to efer set above. 4476 * 4477 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4478 * (KVM doesn't change it); 4479 */ 4480 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4481 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4482 4483 /* Same as above - no reason to call set_cr4_guest_host_mask().
*/ 4484 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4485 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4486 4487 nested_ept_uninit_mmu_context(vcpu); 4488 4489 /* 4490 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4491 * couldn't have changed. 4492 */ 4493 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4494 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4495 4496 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4497 4498 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4499 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4500 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4501 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4502 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4503 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4504 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4505 4506 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4507 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4508 vmcs_write64(GUEST_BNDCFGS, 0); 4509 4510 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4511 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4512 vcpu->arch.pat = vmcs12->host_ia32_pat; 4513 } 4514 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4515 intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4516 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4517 vmcs12->host_ia32_perf_global_ctrl)); 4518 4519 /* Set L1 segment info according to Intel SDM 4520 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4521 seg = (struct kvm_segment) { 4522 .base = 0, 4523 .limit = 0xFFFFFFFF, 4524 .selector = vmcs12->host_cs_selector, 4525 .type = 11, 4526 .present = 1, 4527 .s = 1, 4528 .g = 1 4529 }; 4530 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4531 seg.l = 1; 4532 else 4533 seg.db = 1; 4534 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4535 seg = (struct kvm_segment) { 4536 .base = 0, 4537 .limit = 0xFFFFFFFF, 4538 .type = 3, 4539 .present = 1, 4540 .s = 1, 4541 .db = 1, 4542 .g = 1 4543 }; 4544 seg.selector = vmcs12->host_ds_selector; 4545 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4546 seg.selector = vmcs12->host_es_selector; 4547 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4548 seg.selector = vmcs12->host_ss_selector; 4549 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4550 seg.selector = vmcs12->host_fs_selector; 4551 seg.base = vmcs12->host_fs_base; 4552 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4553 seg.selector = vmcs12->host_gs_selector; 4554 seg.base = vmcs12->host_gs_base; 4555 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4556 seg = (struct kvm_segment) { 4557 .base = vmcs12->host_tr_base, 4558 .limit = 0x67, 4559 .selector = vmcs12->host_tr_selector, 4560 .type = 11, 4561 .present = 1 4562 }; 4563 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4564 4565 memset(&seg, 0, sizeof(seg)); 4566 seg.unusable = 1; 4567 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4568 4569 kvm_set_dr(vcpu, 7, 0x400); 4570 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4571 4572 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4573 vmcs12->vm_exit_msr_load_count)) 4574 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4575 4576 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4577 } 4578 4579 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4580 { 4581 struct vmx_uret_msr *efer_msr; 4582 unsigned int i; 4583 4584 if (vm_entry_controls_get(vmx) & 
VM_ENTRY_LOAD_IA32_EFER) 4585 return vmcs_read64(GUEST_IA32_EFER); 4586 4587 if (cpu_has_load_ia32_efer()) 4588 return host_efer; 4589 4590 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4591 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4592 return vmx->msr_autoload.guest.val[i].value; 4593 } 4594 4595 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4596 if (efer_msr) 4597 return efer_msr->data; 4598 4599 return host_efer; 4600 } 4601 4602 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4603 { 4604 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4605 struct vcpu_vmx *vmx = to_vmx(vcpu); 4606 struct vmx_msr_entry g, h; 4607 gpa_t gpa; 4608 u32 i, j; 4609 4610 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4611 4612 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4613 /* 4614 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4615 * as vmcs01.GUEST_DR7 contains a userspace defined value 4616 * and vcpu->arch.dr7 is not squirreled away before the 4617 * nested VMENTER (not worth adding a variable in nested_vmx). 4618 */ 4619 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4620 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4621 else 4622 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4623 } 4624 4625 /* 4626 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4627 * handle a variety of side effects to KVM's software model. 4628 */ 4629 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4630 4631 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4632 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4633 4634 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4635 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4636 4637 nested_ept_uninit_mmu_context(vcpu); 4638 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4639 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4640 4641 /* 4642 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4643 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4644 * VMFail, like everything else we just need to ensure our 4645 * software model is up-to-date. 4646 */ 4647 if (enable_ept && is_pae_paging(vcpu)) 4648 ept_save_pdptrs(vcpu); 4649 4650 kvm_mmu_reset_context(vcpu); 4651 4652 /* 4653 * This nasty bit of open coding is a compromise between blindly 4654 * loading L1's MSRs using the exit load lists (incorrect emulation 4655 * of VMFail), leaving the nested VM's MSRs in the software model 4656 * (incorrect behavior) and snapshotting the modified MSRs (too 4657 * expensive since the lists are unbound by hardware). For each 4658 * MSR that was (prematurely) loaded from the nested VMEntry load 4659 * list, reload it from the exit load list if it exists and differs 4660 * from the guest value. The intent is to stuff host state as 4661 * silently as possible, not to fully process the exit load list. 
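	 *
	 * For example, if both lists contain MSR_STAR but with different
	 * values, the VM-Exit list's value is restored via kvm_set_msr();
	 * an MSR that appears only in the VM-Entry load list is deliberately
	 * left alone.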
4662 */ 4663 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4664 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4665 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4666 pr_debug_ratelimited( 4667 "%s read MSR index failed (%u, 0x%08llx)\n", 4668 __func__, i, gpa); 4669 goto vmabort; 4670 } 4671 4672 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4673 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4674 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4675 pr_debug_ratelimited( 4676 "%s read MSR failed (%u, 0x%08llx)\n", 4677 __func__, j, gpa); 4678 goto vmabort; 4679 } 4680 if (h.index != g.index) 4681 continue; 4682 if (h.value == g.value) 4683 break; 4684 4685 if (nested_vmx_load_msr_check(vcpu, &h)) { 4686 pr_debug_ratelimited( 4687 "%s check failed (%u, 0x%x, 0x%x)\n", 4688 __func__, j, h.index, h.reserved); 4689 goto vmabort; 4690 } 4691 4692 if (kvm_set_msr(vcpu, h.index, h.value)) { 4693 pr_debug_ratelimited( 4694 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4695 __func__, j, h.index, h.value); 4696 goto vmabort; 4697 } 4698 } 4699 } 4700 4701 return; 4702 4703 vmabort: 4704 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4705 } 4706 4707 /* 4708 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4709 * and modify vmcs12 to make it see what it would expect to see there if 4710 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4711 */ 4712 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4713 u32 exit_intr_info, unsigned long exit_qualification) 4714 { 4715 struct vcpu_vmx *vmx = to_vmx(vcpu); 4716 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4717 4718 /* Pending MTF traps are discarded on VM-Exit. */ 4719 vmx->nested.mtf_pending = false; 4720 4721 /* trying to cancel vmlaunch/vmresume is a bug */ 4722 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4723 4724 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4725 /* 4726 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4727 * Enlightened VMCS after migration and we still need to 4728 * do that when something is forcing L2->L1 exit prior to 4729 * the first L2 run. 4730 */ 4731 (void)nested_get_evmcs_page(vcpu); 4732 } 4733 4734 /* Service pending TLB flush requests for L2 before switching to L1. */ 4735 kvm_service_local_tlb_flush_requests(vcpu); 4736 4737 /* 4738 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4739 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4740 * up-to-date before switching to L1. 4741 */ 4742 if (enable_ept && is_pae_paging(vcpu)) 4743 vmx_ept_load_pdptrs(vcpu); 4744 4745 leave_guest_mode(vcpu); 4746 4747 if (nested_cpu_has_preemption_timer(vmcs12)) 4748 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4749 4750 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4751 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4752 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4753 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4754 } 4755 4756 if (likely(!vmx->fail)) { 4757 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4758 4759 if (vm_exit_reason != -1) 4760 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4761 exit_intr_info, exit_qualification); 4762 4763 /* 4764 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4765 * also be used to capture vmcs12 cache as part of 4766 * capturing nVMX state for snapshot (migration). 
4767 * 4768 * Otherwise, this flush will dirty guest memory at a 4769 * point it is already assumed by user-space to be 4770 * immutable. 4771 */ 4772 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4773 } else { 4774 /* 4775 * The only expected VM-instruction error is "VM entry with 4776 * invalid control field(s)." Anything else indicates a 4777 * problem with L0. And we should never get here with a 4778 * VMFail of any type if early consistency checks are enabled. 4779 */ 4780 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4781 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4782 WARN_ON_ONCE(nested_early_check); 4783 } 4784 4785 /* 4786 * Drop events/exceptions that were queued for re-injection to L2 4787 * (picked up via vmx_complete_interrupts()), as well as exceptions 4788 * that were pending for L2. Note, this must NOT be hoisted above 4789 * prepare_vmcs12(), events/exceptions queued for re-injection need to 4790 * be captured in vmcs12 (see vmcs12_save_pending_event()). 4791 */ 4792 vcpu->arch.nmi_injected = false; 4793 kvm_clear_exception_queue(vcpu); 4794 kvm_clear_interrupt_queue(vcpu); 4795 4796 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4797 4798 /* 4799 * If IBRS is advertised to the vCPU, KVM must flush the indirect 4800 * branch predictors when transitioning from L2 to L1, as L1 expects 4801 * hardware (KVM in this case) to provide separate predictor modes. 4802 * Bare metal isolates VMX root (host) from VMX non-root (guest), but 4803 * doesn't isolate different VMCSs, i.e. in this case, doesn't provide 4804 * separate modes for L2 vs L1. 4805 */ 4806 if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) 4807 indirect_branch_prediction_barrier(); 4808 4809 /* Update any VMCS fields that might have changed while L2 ran */ 4810 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4811 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4812 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4813 if (kvm_caps.has_tsc_control) 4814 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 4815 4816 if (vmx->nested.l1_tpr_threshold != -1) 4817 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4818 4819 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4820 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4821 vmx_set_virtual_apic_mode(vcpu); 4822 } 4823 4824 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4825 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4826 vmx_update_cpu_dirty_logging(vcpu); 4827 } 4828 4829 /* Unpin physical memory we referred to in vmcs02 */ 4830 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 4831 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4832 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4833 vmx->nested.pi_desc = NULL; 4834 4835 if (vmx->nested.reload_vmcs01_apic_access_page) { 4836 vmx->nested.reload_vmcs01_apic_access_page = false; 4837 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4838 } 4839 4840 if (vmx->nested.update_vmcs01_apicv_status) { 4841 vmx->nested.update_vmcs01_apicv_status = false; 4842 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 4843 } 4844 4845 if ((vm_exit_reason != -1) && 4846 (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) 4847 vmx->nested.need_vmcs12_to_shadow_sync = true; 4848 4849 /* in case we halted in L2 */ 4850 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4851 4852 if (likely(!vmx->fail)) { 4853 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4854 nested_exit_intr_ack_set(vcpu)) { 4855 int irq = 
kvm_cpu_get_interrupt(vcpu); 4856 WARN_ON(irq < 0); 4857 vmcs12->vm_exit_intr_info = irq | 4858 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4859 } 4860 4861 if (vm_exit_reason != -1) 4862 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4863 vmcs12->exit_qualification, 4864 vmcs12->idt_vectoring_info_field, 4865 vmcs12->vm_exit_intr_info, 4866 vmcs12->vm_exit_intr_error_code, 4867 KVM_ISA_VMX); 4868 4869 load_vmcs12_host_state(vcpu, vmcs12); 4870 4871 return; 4872 } 4873 4874 /* 4875 * After an early L2 VM-entry failure, we're now back 4876 * in L1 which thinks it just finished a VMLAUNCH or 4877 * VMRESUME instruction, so we need to set the failure 4878 * flag and the VM-instruction error field of the VMCS 4879 * accordingly, and skip the emulated instruction. 4880 */ 4881 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4882 4883 /* 4884 * Restore L1's host state to KVM's software model. We're here 4885 * because a consistency check was caught by hardware, which 4886 * means some amount of guest state has been propagated to KVM's 4887 * model and needs to be unwound to the host's state. 4888 */ 4889 nested_vmx_restore_host_state(vcpu); 4890 4891 vmx->fail = 0; 4892 } 4893 4894 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4895 { 4896 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4897 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4898 } 4899 4900 /* 4901 * Decode the memory-address operand of a vmx instruction, as recorded on an 4902 * exit caused by such an instruction (run by a guest hypervisor). 4903 * On success, returns 0. When the operand is invalid, returns 1 and throws 4904 * #UD, #GP, or #SS. 4905 */ 4906 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4907 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4908 { 4909 gva_t off; 4910 bool exn; 4911 struct kvm_segment s; 4912 4913 /* 4914 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4915 * Execution", on an exit, vmx_instruction_info holds most of the 4916 * addressing components of the operand. Only the displacement part 4917 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4918 * For how an actual address is calculated from all these components, 4919 * refer to Vol. 1, "Operand Addressing". 4920 */ 4921 int scaling = vmx_instruction_info & 3; 4922 int addr_size = (vmx_instruction_info >> 7) & 7; 4923 bool is_reg = vmx_instruction_info & (1u << 10); 4924 int seg_reg = (vmx_instruction_info >> 15) & 7; 4925 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4926 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4927 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4928 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4929 4930 if (is_reg) { 4931 kvm_queue_exception(vcpu, UD_VECTOR); 4932 return 1; 4933 } 4934 4935 /* Addr = segment_base + offset */ 4936 /* offset = base + [index * scale] + displacement */ 4937 off = exit_qualification; /* holds the displacement */ 4938 if (addr_size == 1) 4939 off = (gva_t)sign_extend64(off, 31); 4940 else if (addr_size == 0) 4941 off = (gva_t)sign_extend64(off, 15); 4942 if (base_is_valid) 4943 off += kvm_register_read(vcpu, base_reg); 4944 if (index_is_valid) 4945 off += kvm_register_read(vcpu, index_reg) << scaling; 4946 vmx_get_segment(vcpu, &s, seg_reg); 4947 4948 /* 4949 * The effective address, i.e. @off, of a memory operand is truncated 4950 * based on the address size of the instruction. 
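 *
 * As a quick illustration (arbitrary values): with a 16-bit address size, a
 * base register of 0xfff0 plus a displacement of 0x20 gives off = 0x10010,
 * which the masking below reduces to 0x0010.
 *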
Note that this is 4951 * the *effective address*, i.e. the address prior to accounting for 4952 * the segment's base. 4953 */ 4954 if (addr_size == 1) /* 32 bit */ 4955 off &= 0xffffffff; 4956 else if (addr_size == 0) /* 16 bit */ 4957 off &= 0xffff; 4958 4959 /* Checks for #GP/#SS exceptions. */ 4960 exn = false; 4961 if (is_long_mode(vcpu)) { 4962 /* 4963 * The virtual/linear address is never truncated in 64-bit 4964 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4965 * address when using FS/GS with a non-zero base. 4966 */ 4967 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4968 *ret = s.base + off; 4969 else 4970 *ret = off; 4971 4972 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4973 * non-canonical form. This is the only check on the memory 4974 * destination for long mode! 4975 */ 4976 exn = is_noncanonical_address(*ret, vcpu); 4977 } else { 4978 /* 4979 * When not in long mode, the virtual/linear address is 4980 * unconditionally truncated to 32 bits regardless of the 4981 * address size. 4982 */ 4983 *ret = (s.base + off) & 0xffffffff; 4984 4985 /* Protected mode: apply checks for segment validity in the 4986 * following order: 4987 * - segment type check (#GP(0) may be thrown) 4988 * - usability check (#GP(0)/#SS(0)) 4989 * - limit check (#GP(0)/#SS(0)) 4990 */ 4991 if (wr) 4992 /* #GP(0) if the destination operand is located in a 4993 * read-only data segment or any code segment. 4994 */ 4995 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4996 else 4997 /* #GP(0) if the source operand is located in an 4998 * execute-only code segment 4999 */ 5000 exn = ((s.type & 0xa) == 8); 5001 if (exn) { 5002 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5003 return 1; 5004 } 5005 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5006 */ 5007 exn = (s.unusable != 0); 5008 5009 /* 5010 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5011 * outside the segment limit. All CPUs that support VMX ignore 5012 * limit checks for flat segments, i.e. segments with base==0, 5013 * limit==0xffffffff and of type expand-up data or code. 5014 */ 5015 if (!(s.base == 0 && s.limit == 0xffffffff && 5016 ((s.type & 8) || !(s.type & 4)))) 5017 exn = exn || ((u64)off + len - 1 > s.limit); 5018 } 5019 if (exn) { 5020 kvm_queue_exception_e(vcpu, 5021 seg_reg == VCPU_SREG_SS ? 5022 SS_VECTOR : GP_VECTOR, 5023 0); 5024 return 1; 5025 } 5026 5027 return 0; 5028 } 5029 5030 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5031 int *ret) 5032 { 5033 gva_t gva; 5034 struct x86_exception e; 5035 int r; 5036 5037 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5038 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5039 sizeof(*vmpointer), &gva)) { 5040 *ret = 1; 5041 return -EINVAL; 5042 } 5043 5044 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5045 if (r != X86EMUL_CONTINUE) { 5046 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5047 return -EINVAL; 5048 } 5049 5050 return 0; 5051 } 5052 5053 /* 5054 * Allocate a shadow VMCS and associate it with the currently loaded 5055 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5056 * VMCS is also VMCLEARed, so that it is ready for use. 
5057 */ 5058 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5059 { 5060 struct vcpu_vmx *vmx = to_vmx(vcpu); 5061 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5062 5063 /* 5064 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5065 * when L1 executes VMXOFF or the vCPU is forced out of nested 5066 * operation. VMXON faults if the CPU is already post-VMXON, so it 5067 * should be impossible to already have an allocated shadow VMCS. KVM 5068 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5069 * always be the loaded VMCS. 5070 */ 5071 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5072 return loaded_vmcs->shadow_vmcs; 5073 5074 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5075 if (loaded_vmcs->shadow_vmcs) 5076 vmcs_clear(loaded_vmcs->shadow_vmcs); 5077 5078 return loaded_vmcs->shadow_vmcs; 5079 } 5080 5081 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5082 { 5083 struct vcpu_vmx *vmx = to_vmx(vcpu); 5084 int r; 5085 5086 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5087 if (r < 0) 5088 goto out_vmcs02; 5089 5090 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5091 if (!vmx->nested.cached_vmcs12) 5092 goto out_cached_vmcs12; 5093 5094 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5095 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5096 if (!vmx->nested.cached_shadow_vmcs12) 5097 goto out_cached_shadow_vmcs12; 5098 5099 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5100 goto out_shadow_vmcs; 5101 5102 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 5103 HRTIMER_MODE_ABS_PINNED); 5104 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 5105 5106 vmx->nested.vpid02 = allocate_vpid(); 5107 5108 vmx->nested.vmcs02_initialized = false; 5109 vmx->nested.vmxon = true; 5110 5111 if (vmx_pt_mode_is_host_guest()) { 5112 vmx->pt_desc.guest.ctl = 0; 5113 pt_update_intercept_for_msr(vcpu); 5114 } 5115 5116 return 0; 5117 5118 out_shadow_vmcs: 5119 kfree(vmx->nested.cached_shadow_vmcs12); 5120 5121 out_cached_shadow_vmcs12: 5122 kfree(vmx->nested.cached_vmcs12); 5123 5124 out_cached_vmcs12: 5125 free_loaded_vmcs(&vmx->nested.vmcs02); 5126 5127 out_vmcs02: 5128 return -ENOMEM; 5129 } 5130 5131 /* Emulate the VMXON instruction. */ 5132 static int handle_vmxon(struct kvm_vcpu *vcpu) 5133 { 5134 int ret; 5135 gpa_t vmptr; 5136 uint32_t revision; 5137 struct vcpu_vmx *vmx = to_vmx(vcpu); 5138 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5139 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5140 5141 /* 5142 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5143 * the guest and so cannot rely on hardware to perform the check, 5144 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5145 * for VMXON). 5146 * 5147 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5148 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5149 * force any of the relevant guest state. For a restricted guest, KVM 5150 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5151 * Real Mode, and so there's no need to check CR0.PE manually. 5152 */ 5153 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { 5154 kvm_queue_exception(vcpu, UD_VECTOR); 5155 return 1; 5156 } 5157 5158 /* 5159 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5160 * and has higher priority than the VM-Fail due to being post-VMXON, 5161 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. 
In VMX non-root, 5162 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5163 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5164 * VMX non-root. 5165 * 5166 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5167 * #UD checks (see above), is functionally ok because KVM doesn't allow 5168 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's 5169 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5170 * missed by hardware due to shadowing CR0 and/or CR4. 5171 */ 5172 if (vmx_get_cpl(vcpu)) { 5173 kvm_inject_gp(vcpu, 0); 5174 return 1; 5175 } 5176 5177 if (vmx->nested.vmxon) 5178 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5179 5180 /* 5181 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5182 * only if the vCPU isn't already in VMX operation, i.e. effectively 5183 * have lower priority than the VM-Fail above. 5184 */ 5185 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5186 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5187 kvm_inject_gp(vcpu, 0); 5188 return 1; 5189 } 5190 5191 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5192 != VMXON_NEEDED_FEATURES) { 5193 kvm_inject_gp(vcpu, 0); 5194 return 1; 5195 } 5196 5197 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5198 return ret; 5199 5200 /* 5201 * SDM 3: 24.11.5 5202 * The first 4 bytes of VMXON region contain the supported 5203 * VMCS revision identifier 5204 * 5205 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5206 * which replaces physical address width with 32 5207 */ 5208 if (!page_address_valid(vcpu, vmptr)) 5209 return nested_vmx_failInvalid(vcpu); 5210 5211 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5212 revision != VMCS12_REVISION) 5213 return nested_vmx_failInvalid(vcpu); 5214 5215 vmx->nested.vmxon_ptr = vmptr; 5216 ret = enter_vmx_operation(vcpu); 5217 if (ret) 5218 return ret; 5219 5220 return nested_vmx_succeed(vcpu); 5221 } 5222 5223 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5224 { 5225 struct vcpu_vmx *vmx = to_vmx(vcpu); 5226 5227 if (vmx->nested.current_vmptr == INVALID_GPA) 5228 return; 5229 5230 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5231 5232 if (enable_shadow_vmcs) { 5233 /* copy to memory all shadowed fields in case 5234 they were modified */ 5235 copy_shadow_to_vmcs12(vmx); 5236 vmx_disable_shadow_vmcs(vmx); 5237 } 5238 vmx->nested.posted_intr_nv = -1; 5239 5240 /* Flush VMCS12 to guest memory */ 5241 kvm_vcpu_write_guest_page(vcpu, 5242 vmx->nested.current_vmptr >> PAGE_SHIFT, 5243 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5244 5245 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5246 5247 vmx->nested.current_vmptr = INVALID_GPA; 5248 } 5249 5250 /* Emulate the VMXOFF instruction */ 5251 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5252 { 5253 if (!nested_vmx_check_permission(vcpu)) 5254 return 1; 5255 5256 free_nested(vcpu); 5257 5258 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5259 kvm_make_request(KVM_REQ_EVENT, vcpu); 5260 5261 return nested_vmx_succeed(vcpu); 5262 } 5263 5264 /* Emulate the VMCLEAR instruction */ 5265 static int handle_vmclear(struct kvm_vcpu *vcpu) 5266 { 5267 struct vcpu_vmx *vmx = to_vmx(vcpu); 5268 u32 zero = 0; 5269 gpa_t vmptr; 5270 int r; 5271 5272 if (!nested_vmx_check_permission(vcpu)) 5273 return 1; 5274 5275 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5276 return r; 5277 5278 if (!page_address_valid(vcpu, vmptr)) 
5279		return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
5280
5281	if (vmptr == vmx->nested.vmxon_ptr)
5282		return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
5283
5284	/*
5285	 * When Enlightened VMEntry is enabled on the calling CPU we treat the
5286	 * memory area pointed to by vmptr as Enlightened VMCS (as there's no good
5287	 * way to distinguish it from VMCS12) and we must not corrupt it by
5288	 * writing to the non-existent 'launch_state' field. The area doesn't
5289	 * have to be the currently active EVMCS on the calling CPU and there's
5290	 * nothing KVM has to do to transition it from 'active' to 'non-active'
5291	 * state. It is possible that the area will stay mapped as
5292	 * vmx->nested.hv_evmcs but this shouldn't be a problem.
5293	 */
5294	if (likely(!guest_cpuid_has_evmcs(vcpu) ||
5295		   !evmptr_is_valid(nested_get_evmptr(vcpu)))) {
5296		if (vmptr == vmx->nested.current_vmptr)
5297			nested_release_vmcs12(vcpu);
5298
5299		/*
5300		 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode
5301		 * for VMCLEAR includes an "ensure that data for VMCS referenced
5302		 * by the operand is in memory" clause that guards writes to
5303		 * memory, i.e. doing nothing for I/O is architecturally valid.
5304		 *
5305		 * FIXME: Suppress failures if and only if no memslot is found,
5306		 * i.e. exit to userspace if __copy_to_user() fails.
5307		 */
5308		(void)kvm_vcpu_write_guest(vcpu,
5309					   vmptr + offsetof(struct vmcs12,
5310							    launch_state),
5311					   &zero, sizeof(zero));
5312	} else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) {
5313		nested_release_evmcs(vcpu);
5314	}
5315
5316	return nested_vmx_succeed(vcpu);
5317 }
5318
5319 /* Emulate the VMLAUNCH instruction */
5320 static int handle_vmlaunch(struct kvm_vcpu *vcpu)
5321 {
5322	return nested_vmx_run(vcpu, true);
5323 }
5324
5325 /* Emulate the VMRESUME instruction */
5326 static int handle_vmresume(struct kvm_vcpu *vcpu)
5327 {
5328
5329	return nested_vmx_run(vcpu, false);
5330 }
5331
5332 static int handle_vmread(struct kvm_vcpu *vcpu)
5333 {
5334	struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
5335						    : get_vmcs12(vcpu);
5336	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5337	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5338	struct vcpu_vmx *vmx = to_vmx(vcpu);
5339	struct x86_exception e;
5340	unsigned long field;
5341	u64 value;
5342	gva_t gva = 0;
5343	short offset;
5344	int len, r;
5345
5346	if (!nested_vmx_check_permission(vcpu))
5347		return 1;
5348
5349	/* Decode instruction info and find the field to read */
5350	field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
5351
5352	if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
5353		/*
5354		 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
5355		 * any VMREAD sets the ALU flags for VMfailInvalid.
5356 */ 5357 if (vmx->nested.current_vmptr == INVALID_GPA || 5358 (is_guest_mode(vcpu) && 5359 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5360 return nested_vmx_failInvalid(vcpu); 5361 5362 offset = get_vmcs12_field_offset(field); 5363 if (offset < 0) 5364 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5365 5366 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5367 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5368 5369 /* Read the field, zero-extended to a u64 value */ 5370 value = vmcs12_read_any(vmcs12, field, offset); 5371 } else { 5372 /* 5373 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5374 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5375 * unsupported. Unfortunately, certain versions of Windows 11 5376 * don't comply with this requirement which is not enforced in 5377 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5378 * workaround, as misbehaving guests will panic on VM-Fail. 5379 * Note, enlightened VMCS is incompatible with shadow VMCS so 5380 * all VMREADs from L2 should go to L1. 5381 */ 5382 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5383 return nested_vmx_failInvalid(vcpu); 5384 5385 offset = evmcs_field_offset(field, NULL); 5386 if (offset < 0) 5387 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5388 5389 /* Read the field, zero-extended to a u64 value */ 5390 value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset); 5391 } 5392 5393 /* 5394 * Now copy part of this value to register or memory, as requested. 5395 * Note that the number of bits actually copied is 32 or 64 depending 5396 * on the guest's mode (32 or 64 bit), not on the given field's length. 5397 */ 5398 if (instr_info & BIT(10)) { 5399 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5400 } else { 5401 len = is_64_bit_mode(vcpu) ? 8 : 4; 5402 if (get_vmx_mem_address(vcpu, exit_qualification, 5403 instr_info, true, len, &gva)) 5404 return 1; 5405 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5406 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5407 if (r != X86EMUL_CONTINUE) 5408 return kvm_handle_memory_failure(vcpu, r, &e); 5409 } 5410 5411 return nested_vmx_succeed(vcpu); 5412 } 5413 5414 static bool is_shadow_field_rw(unsigned long field) 5415 { 5416 switch (field) { 5417 #define SHADOW_FIELD_RW(x, y) case x: 5418 #include "vmcs_shadow_fields.h" 5419 return true; 5420 default: 5421 break; 5422 } 5423 return false; 5424 } 5425 5426 static bool is_shadow_field_ro(unsigned long field) 5427 { 5428 switch (field) { 5429 #define SHADOW_FIELD_RO(x, y) case x: 5430 #include "vmcs_shadow_fields.h" 5431 return true; 5432 default: 5433 break; 5434 } 5435 return false; 5436 } 5437 5438 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5439 { 5440 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5441 : get_vmcs12(vcpu); 5442 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5443 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5444 struct vcpu_vmx *vmx = to_vmx(vcpu); 5445 struct x86_exception e; 5446 unsigned long field; 5447 short offset; 5448 gva_t gva; 5449 int len, r; 5450 5451 /* 5452 * The value to write might be 32 or 64 bits, depending on L1's long 5453 * mode, and eventually we need to write that into a field of several 5454 * possible lengths. The code below first zero-extends the value to 64 5455 * bit (value), and then copies only the appropriate number of 5456 * bits into the vmcs12 field. 
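 *
 * For example (illustrative, for a 32-bit L1 using a memory operand): only
 * len == 4 bytes are read, so a VMWRITE of 0x12345678 yields value ==
 * 0x0000000012345678, and vmcs12_write_any() then stores just the low bits
 * the destination field can hold, e.g. the low 16 bits for a 16-bit field
 * such as GUEST_ES_SELECTOR.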
5457 */ 5458 u64 value = 0; 5459 5460 if (!nested_vmx_check_permission(vcpu)) 5461 return 1; 5462 5463 /* 5464 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5465 * any VMWRITE sets the ALU flags for VMfailInvalid. 5466 */ 5467 if (vmx->nested.current_vmptr == INVALID_GPA || 5468 (is_guest_mode(vcpu) && 5469 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5470 return nested_vmx_failInvalid(vcpu); 5471 5472 if (instr_info & BIT(10)) 5473 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5474 else { 5475 len = is_64_bit_mode(vcpu) ? 8 : 4; 5476 if (get_vmx_mem_address(vcpu, exit_qualification, 5477 instr_info, false, len, &gva)) 5478 return 1; 5479 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5480 if (r != X86EMUL_CONTINUE) 5481 return kvm_handle_memory_failure(vcpu, r, &e); 5482 } 5483 5484 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5485 5486 offset = get_vmcs12_field_offset(field); 5487 if (offset < 0) 5488 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5489 5490 /* 5491 * If the vCPU supports "VMWRITE to any supported field in the 5492 * VMCS," then the "read-only" fields are actually read/write. 5493 */ 5494 if (vmcs_field_readonly(field) && 5495 !nested_cpu_has_vmwrite_any_field(vcpu)) 5496 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5497 5498 /* 5499 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5500 * vmcs12, else we may crush a field or consume a stale value. 5501 */ 5502 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5503 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5504 5505 /* 5506 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5507 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5508 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5509 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5510 * from L1 will return a different value than VMREAD from L2 (L1 sees 5511 * the stripped down value, L2 sees the full value as stored by KVM). 5512 */ 5513 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5514 value &= 0x1f0ff; 5515 5516 vmcs12_write_any(vmcs12, field, offset, value); 5517 5518 /* 5519 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5520 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5521 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5522 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5523 */ 5524 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5525 /* 5526 * L1 can read these fields without exiting, ensure the 5527 * shadow VMCS is up-to-date. 
5528 */ 5529 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5530 preempt_disable(); 5531 vmcs_load(vmx->vmcs01.shadow_vmcs); 5532 5533 __vmcs_writel(field, value); 5534 5535 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5536 vmcs_load(vmx->loaded_vmcs->vmcs); 5537 preempt_enable(); 5538 } 5539 vmx->nested.dirty_vmcs12 = true; 5540 } 5541 5542 return nested_vmx_succeed(vcpu); 5543 } 5544 5545 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5546 { 5547 vmx->nested.current_vmptr = vmptr; 5548 if (enable_shadow_vmcs) { 5549 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5550 vmcs_write64(VMCS_LINK_POINTER, 5551 __pa(vmx->vmcs01.shadow_vmcs)); 5552 vmx->nested.need_vmcs12_to_shadow_sync = true; 5553 } 5554 vmx->nested.dirty_vmcs12 = true; 5555 vmx->nested.force_msr_bitmap_recalc = true; 5556 } 5557 5558 /* Emulate the VMPTRLD instruction */ 5559 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5560 { 5561 struct vcpu_vmx *vmx = to_vmx(vcpu); 5562 gpa_t vmptr; 5563 int r; 5564 5565 if (!nested_vmx_check_permission(vcpu)) 5566 return 1; 5567 5568 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5569 return r; 5570 5571 if (!page_address_valid(vcpu, vmptr)) 5572 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5573 5574 if (vmptr == vmx->nested.vmxon_ptr) 5575 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5576 5577 /* Forbid normal VMPTRLD if Enlightened version was used */ 5578 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 5579 return 1; 5580 5581 if (vmx->nested.current_vmptr != vmptr) { 5582 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5583 struct vmcs_hdr hdr; 5584 5585 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5586 /* 5587 * Reads from an unbacked page return all 1s, 5588 * which means that the 32 bits located at the 5589 * given physical address won't match the required 5590 * VMCS12_REVISION identifier. 5591 */ 5592 return nested_vmx_fail(vcpu, 5593 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5594 } 5595 5596 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5597 offsetof(struct vmcs12, hdr), 5598 sizeof(hdr))) { 5599 return nested_vmx_fail(vcpu, 5600 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5601 } 5602 5603 if (hdr.revision_id != VMCS12_REVISION || 5604 (hdr.shadow_vmcs && 5605 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5606 return nested_vmx_fail(vcpu, 5607 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5608 } 5609 5610 nested_release_vmcs12(vcpu); 5611 5612 /* 5613 * Load VMCS12 from guest memory since it is not already 5614 * cached. 
5615 */ 5616 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5617 VMCS12_SIZE)) { 5618 return nested_vmx_fail(vcpu, 5619 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5620 } 5621 5622 set_current_vmptr(vmx, vmptr); 5623 } 5624 5625 return nested_vmx_succeed(vcpu); 5626 } 5627 5628 /* Emulate the VMPTRST instruction */ 5629 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5630 { 5631 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5632 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5633 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5634 struct x86_exception e; 5635 gva_t gva; 5636 int r; 5637 5638 if (!nested_vmx_check_permission(vcpu)) 5639 return 1; 5640 5641 if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) 5642 return 1; 5643 5644 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5645 true, sizeof(gpa_t), &gva)) 5646 return 1; 5647 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5648 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5649 sizeof(gpa_t), &e); 5650 if (r != X86EMUL_CONTINUE) 5651 return kvm_handle_memory_failure(vcpu, r, &e); 5652 5653 return nested_vmx_succeed(vcpu); 5654 } 5655 5656 /* Emulate the INVEPT instruction */ 5657 static int handle_invept(struct kvm_vcpu *vcpu) 5658 { 5659 struct vcpu_vmx *vmx = to_vmx(vcpu); 5660 u32 vmx_instruction_info, types; 5661 unsigned long type, roots_to_free; 5662 struct kvm_mmu *mmu; 5663 gva_t gva; 5664 struct x86_exception e; 5665 struct { 5666 u64 eptp, gpa; 5667 } operand; 5668 int i, r, gpr_index; 5669 5670 if (!(vmx->nested.msrs.secondary_ctls_high & 5671 SECONDARY_EXEC_ENABLE_EPT) || 5672 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5673 kvm_queue_exception(vcpu, UD_VECTOR); 5674 return 1; 5675 } 5676 5677 if (!nested_vmx_check_permission(vcpu)) 5678 return 1; 5679 5680 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5681 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5682 type = kvm_register_read(vcpu, gpr_index); 5683 5684 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5685 5686 if (type >= 32 || !(types & (1 << type))) 5687 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5688 5689 /* According to the Intel VMX instruction reference, the memory 5690 * operand is read even if it isn't needed (e.g., for type==global) 5691 */ 5692 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5693 vmx_instruction_info, false, sizeof(operand), &gva)) 5694 return 1; 5695 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5696 if (r != X86EMUL_CONTINUE) 5697 return kvm_handle_memory_failure(vcpu, r, &e); 5698 5699 /* 5700 * Nested EPT roots are always held through guest_mmu, 5701 * not root_mmu. 
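 *
 * As an illustration of the single-context case below: an INVEPT whose
 * operand.eptp matches the EPTP of the current root frees only that root
 * (KVM_MMU_ROOT_CURRENT) plus any matching prev_roots entry, while the
 * global variant frees every cached root.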
5702 */ 5703 mmu = &vcpu->arch.guest_mmu; 5704 5705 switch (type) { 5706 case VMX_EPT_EXTENT_CONTEXT: 5707 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5708 return nested_vmx_fail(vcpu, 5709 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5710 5711 roots_to_free = 0; 5712 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 5713 operand.eptp)) 5714 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5715 5716 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5717 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5718 mmu->prev_roots[i].pgd, 5719 operand.eptp)) 5720 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5721 } 5722 break; 5723 case VMX_EPT_EXTENT_GLOBAL: 5724 roots_to_free = KVM_MMU_ROOTS_ALL; 5725 break; 5726 default: 5727 BUG(); 5728 break; 5729 } 5730 5731 if (roots_to_free) 5732 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 5733 5734 return nested_vmx_succeed(vcpu); 5735 } 5736 5737 static int handle_invvpid(struct kvm_vcpu *vcpu) 5738 { 5739 struct vcpu_vmx *vmx = to_vmx(vcpu); 5740 u32 vmx_instruction_info; 5741 unsigned long type, types; 5742 gva_t gva; 5743 struct x86_exception e; 5744 struct { 5745 u64 vpid; 5746 u64 gla; 5747 } operand; 5748 u16 vpid02; 5749 int r, gpr_index; 5750 5751 if (!(vmx->nested.msrs.secondary_ctls_high & 5752 SECONDARY_EXEC_ENABLE_VPID) || 5753 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5754 kvm_queue_exception(vcpu, UD_VECTOR); 5755 return 1; 5756 } 5757 5758 if (!nested_vmx_check_permission(vcpu)) 5759 return 1; 5760 5761 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5762 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5763 type = kvm_register_read(vcpu, gpr_index); 5764 5765 types = (vmx->nested.msrs.vpid_caps & 5766 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5767 5768 if (type >= 32 || !(types & (1 << type))) 5769 return nested_vmx_fail(vcpu, 5770 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5771 5772 /* according to the intel vmx instruction reference, the memory 5773 * operand is read even if it isn't needed (e.g., for type==global) 5774 */ 5775 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5776 vmx_instruction_info, false, sizeof(operand), &gva)) 5777 return 1; 5778 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5779 if (r != X86EMUL_CONTINUE) 5780 return kvm_handle_memory_failure(vcpu, r, &e); 5781 5782 if (operand.vpid >> 16) 5783 return nested_vmx_fail(vcpu, 5784 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5785 5786 vpid02 = nested_get_vpid02(vcpu); 5787 switch (type) { 5788 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5789 if (!operand.vpid || 5790 is_noncanonical_address(operand.gla, vcpu)) 5791 return nested_vmx_fail(vcpu, 5792 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5793 vpid_sync_vcpu_addr(vpid02, operand.gla); 5794 break; 5795 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5796 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5797 if (!operand.vpid) 5798 return nested_vmx_fail(vcpu, 5799 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5800 vpid_sync_context(vpid02); 5801 break; 5802 case VMX_VPID_EXTENT_ALL_CONTEXT: 5803 vpid_sync_context(vpid02); 5804 break; 5805 default: 5806 WARN_ON_ONCE(1); 5807 return kvm_skip_emulated_instruction(vcpu); 5808 } 5809 5810 /* 5811 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5812 * linear mappings for L2 (tagged with L2's VPID). Free all guest 5813 * roots as VPIDs are not tracked in the MMU role. 5814 * 5815 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5816 * an MMU when EPT is disabled. 
5817 * 5818 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 5819 */ 5820 if (!enable_ept) 5821 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 5822 5823 return nested_vmx_succeed(vcpu); 5824 } 5825 5826 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5827 struct vmcs12 *vmcs12) 5828 { 5829 u32 index = kvm_rcx_read(vcpu); 5830 u64 new_eptp; 5831 5832 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 5833 return 1; 5834 if (index >= VMFUNC_EPTP_ENTRIES) 5835 return 1; 5836 5837 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5838 &new_eptp, index * 8, 8)) 5839 return 1; 5840 5841 /* 5842 * If the (L2) guest does a vmfunc to the currently 5843 * active ept pointer, we don't have to do anything else 5844 */ 5845 if (vmcs12->ept_pointer != new_eptp) { 5846 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5847 return 1; 5848 5849 vmcs12->ept_pointer = new_eptp; 5850 nested_ept_new_eptp(vcpu); 5851 5852 if (!nested_cpu_has_vpid(vmcs12)) 5853 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 5854 } 5855 5856 return 0; 5857 } 5858 5859 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5860 { 5861 struct vcpu_vmx *vmx = to_vmx(vcpu); 5862 struct vmcs12 *vmcs12; 5863 u32 function = kvm_rax_read(vcpu); 5864 5865 /* 5866 * VMFUNC is only supported for nested guests, but we always enable the 5867 * secondary control for simplicity; for non-nested mode, fake that we 5868 * didn't by injecting #UD. 5869 */ 5870 if (!is_guest_mode(vcpu)) { 5871 kvm_queue_exception(vcpu, UD_VECTOR); 5872 return 1; 5873 } 5874 5875 vmcs12 = get_vmcs12(vcpu); 5876 5877 /* 5878 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 5879 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 5880 */ 5881 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 5882 kvm_queue_exception(vcpu, UD_VECTOR); 5883 return 1; 5884 } 5885 5886 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 5887 goto fail; 5888 5889 switch (function) { 5890 case 0: 5891 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5892 goto fail; 5893 break; 5894 default: 5895 goto fail; 5896 } 5897 return kvm_skip_emulated_instruction(vcpu); 5898 5899 fail: 5900 /* 5901 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5902 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5903 * EXIT_REASON_VMFUNC as the exit reason. 5904 */ 5905 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5906 vmx_get_intr_info(vcpu), 5907 vmx_get_exit_qual(vcpu)); 5908 return 1; 5909 } 5910 5911 /* 5912 * Return true if an IO instruction with the specified port and size should cause 5913 * a VM-exit into L1. 
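 *
 * A sketch of the lookup with made-up values: a one-byte access to port
 * 0x3f8 reads byte (0x3f8 & 0x7fff) / 8 = 127 of io_bitmap_a and tests bit
 * 0x3f8 & 7 = 0; a two-byte access starting at port 0x7fff tests the last
 * bit of io_bitmap_a and then the first bit of io_bitmap_b. If any tested
 * bit is set, or a bitmap byte cannot be read from guest memory, the access
 * is reflected to L1.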
5914 */ 5915 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5916 int size) 5917 { 5918 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5919 gpa_t bitmap, last_bitmap; 5920 u8 b; 5921 5922 last_bitmap = INVALID_GPA; 5923 b = -1; 5924 5925 while (size > 0) { 5926 if (port < 0x8000) 5927 bitmap = vmcs12->io_bitmap_a; 5928 else if (port < 0x10000) 5929 bitmap = vmcs12->io_bitmap_b; 5930 else 5931 return true; 5932 bitmap += (port & 0x7fff) / 8; 5933 5934 if (last_bitmap != bitmap) 5935 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5936 return true; 5937 if (b & (1 << (port & 7))) 5938 return true; 5939 5940 port++; 5941 size--; 5942 last_bitmap = bitmap; 5943 } 5944 5945 return false; 5946 } 5947 5948 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5949 struct vmcs12 *vmcs12) 5950 { 5951 unsigned long exit_qualification; 5952 unsigned short port; 5953 int size; 5954 5955 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5956 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 5957 5958 exit_qualification = vmx_get_exit_qual(vcpu); 5959 5960 port = exit_qualification >> 16; 5961 size = (exit_qualification & 7) + 1; 5962 5963 return nested_vmx_check_io_bitmaps(vcpu, port, size); 5964 } 5965 5966 /* 5967 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 5968 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 5969 * disinterest in the current event (read or write a specific MSR) by using an 5970 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 5971 */ 5972 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 5973 struct vmcs12 *vmcs12, 5974 union vmx_exit_reason exit_reason) 5975 { 5976 u32 msr_index = kvm_rcx_read(vcpu); 5977 gpa_t bitmap; 5978 5979 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 5980 return true; 5981 5982 /* 5983 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 5984 * for the four combinations of read/write and low/high MSR numbers. 5985 * First we need to figure out which of the four to use: 5986 */ 5987 bitmap = vmcs12->msr_bitmap; 5988 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 5989 bitmap += 2048; 5990 if (msr_index >= 0xc0000000) { 5991 msr_index -= 0xc0000000; 5992 bitmap += 1024; 5993 } 5994 5995 /* Then read the msr_index'th bit from this bitmap: */ 5996 if (msr_index < 1024*8) { 5997 unsigned char b; 5998 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 5999 return true; 6000 return 1 & (b >> (msr_index & 7)); 6001 } else 6002 return true; /* let L1 handle the wrong parameter */ 6003 } 6004 6005 /* 6006 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6007 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6008 * intercept (via guest_host_mask etc.) the current event. 
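 *
 * For instance, with illustrative values where L1 owns only CR0.TS:
 *
 *	vmcs12->cr0_guest_host_mask = X86_CR0_TS;
 *	vmcs12->cr0_read_shadow &= ~X86_CR0_TS;
 *
 * a MOV to CR0 in L2 that sets TS makes
 * cr0_guest_host_mask & (val ^ cr0_read_shadow) non-zero, so the access is
 * reflected to L1; writes that only touch unowned bits are handled by L0
 * against vmcs02.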
6009 */ 6010 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6011 struct vmcs12 *vmcs12) 6012 { 6013 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6014 int cr = exit_qualification & 15; 6015 int reg; 6016 unsigned long val; 6017 6018 switch ((exit_qualification >> 4) & 3) { 6019 case 0: /* mov to cr */ 6020 reg = (exit_qualification >> 8) & 15; 6021 val = kvm_register_read(vcpu, reg); 6022 switch (cr) { 6023 case 0: 6024 if (vmcs12->cr0_guest_host_mask & 6025 (val ^ vmcs12->cr0_read_shadow)) 6026 return true; 6027 break; 6028 case 3: 6029 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6030 return true; 6031 break; 6032 case 4: 6033 if (vmcs12->cr4_guest_host_mask & 6034 (vmcs12->cr4_read_shadow ^ val)) 6035 return true; 6036 break; 6037 case 8: 6038 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6039 return true; 6040 break; 6041 } 6042 break; 6043 case 2: /* clts */ 6044 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6045 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6046 return true; 6047 break; 6048 case 1: /* mov from cr */ 6049 switch (cr) { 6050 case 3: 6051 if (vmcs12->cpu_based_vm_exec_control & 6052 CPU_BASED_CR3_STORE_EXITING) 6053 return true; 6054 break; 6055 case 8: 6056 if (vmcs12->cpu_based_vm_exec_control & 6057 CPU_BASED_CR8_STORE_EXITING) 6058 return true; 6059 break; 6060 } 6061 break; 6062 case 3: /* lmsw */ 6063 /* 6064 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6065 * cr0. Other attempted changes are ignored, with no exit. 6066 */ 6067 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6068 if (vmcs12->cr0_guest_host_mask & 0xe & 6069 (val ^ vmcs12->cr0_read_shadow)) 6070 return true; 6071 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6072 !(vmcs12->cr0_read_shadow & 0x1) && 6073 (val & 0x1)) 6074 return true; 6075 break; 6076 } 6077 return false; 6078 } 6079 6080 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6081 struct vmcs12 *vmcs12) 6082 { 6083 u32 encls_leaf; 6084 6085 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 6086 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6087 return false; 6088 6089 encls_leaf = kvm_rax_read(vcpu); 6090 if (encls_leaf > 62) 6091 encls_leaf = 63; 6092 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6093 } 6094 6095 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6096 struct vmcs12 *vmcs12, gpa_t bitmap) 6097 { 6098 u32 vmx_instruction_info; 6099 unsigned long field; 6100 u8 b; 6101 6102 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6103 return true; 6104 6105 /* Decode instruction info and find the field to access */ 6106 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6107 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6108 6109 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6110 if (field >> 15) 6111 return true; 6112 6113 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6114 return true; 6115 6116 return 1 & (b >> (field & 7)); 6117 } 6118 6119 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6120 { 6121 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6122 6123 if (nested_cpu_has_mtf(vmcs12)) 6124 return true; 6125 6126 /* 6127 * An MTF VM-exit may be injected into the guest by setting the 6128 * interruption-type to 7 (other event) and the vector field to 0. Such 6129 * is the case regardless of the 'monitor trap flag' VM-execution 6130 * control. 
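 *
 * With the encodings used below that is vm_entry_intr_info_field ==
 * 0x80000700: the valid bit (bit 31) set, interruption type 7 in bits 10:8,
 * and vector 0, which is exactly what the comparison checks.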
6131 */ 6132 return entry_intr_info == (INTR_INFO_VALID_MASK 6133 | INTR_TYPE_OTHER_EVENT); 6134 } 6135 6136 /* 6137 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6138 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6139 */ 6140 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6141 union vmx_exit_reason exit_reason) 6142 { 6143 u32 intr_info; 6144 6145 switch ((u16)exit_reason.basic) { 6146 case EXIT_REASON_EXCEPTION_NMI: 6147 intr_info = vmx_get_intr_info(vcpu); 6148 if (is_nmi(intr_info)) 6149 return true; 6150 else if (is_page_fault(intr_info)) 6151 return vcpu->arch.apf.host_apf_flags || 6152 vmx_need_pf_intercept(vcpu); 6153 else if (is_debug(intr_info) && 6154 vcpu->guest_debug & 6155 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6156 return true; 6157 else if (is_breakpoint(intr_info) && 6158 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6159 return true; 6160 else if (is_alignment_check(intr_info) && 6161 !vmx_guest_inject_ac(vcpu)) 6162 return true; 6163 return false; 6164 case EXIT_REASON_EXTERNAL_INTERRUPT: 6165 return true; 6166 case EXIT_REASON_MCE_DURING_VMENTRY: 6167 return true; 6168 case EXIT_REASON_EPT_VIOLATION: 6169 /* 6170 * L0 always deals with the EPT violation. If nested EPT is 6171 * used, and the nested mmu code discovers that the address is 6172 * missing in the guest EPT table (EPT12), the EPT violation 6173 * will be injected with nested_ept_inject_page_fault() 6174 */ 6175 return true; 6176 case EXIT_REASON_EPT_MISCONFIG: 6177 /* 6178 * L2 never uses directly L1's EPT, but rather L0's own EPT 6179 * table (shadow on EPT) or a merged EPT table that L0 built 6180 * (EPT on EPT). So any problems with the structure of the 6181 * table is L0's fault. 6182 */ 6183 return true; 6184 case EXIT_REASON_PREEMPTION_TIMER: 6185 return true; 6186 case EXIT_REASON_PML_FULL: 6187 /* 6188 * PML is emulated for an L1 VMM and should never be enabled in 6189 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6190 */ 6191 return true; 6192 case EXIT_REASON_VMFUNC: 6193 /* VM functions are emulated through L2->L0 vmexits. */ 6194 return true; 6195 case EXIT_REASON_BUS_LOCK: 6196 /* 6197 * At present, bus lock VM exit is never exposed to L1. 6198 * Handle L2's bus locks in L0 directly. 6199 */ 6200 return true; 6201 case EXIT_REASON_VMCALL: 6202 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6203 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6204 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6205 kvm_hv_is_tlb_flush_hcall(vcpu); 6206 default: 6207 break; 6208 } 6209 return false; 6210 } 6211 6212 /* 6213 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6214 * is_guest_mode (L2). 
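 *
 * As one concrete case handled below: a #GP (vector 13) raised in L2 is
 * reflected to L1 only if bit 13 of vmcs12->exception_bitmap is set,
 * whereas NMIs and page faults are special-cased in the
 * EXIT_REASON_EXCEPTION_NMI handling.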
6215 */ 6216 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6217 union vmx_exit_reason exit_reason) 6218 { 6219 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6220 u32 intr_info; 6221 6222 switch ((u16)exit_reason.basic) { 6223 case EXIT_REASON_EXCEPTION_NMI: 6224 intr_info = vmx_get_intr_info(vcpu); 6225 if (is_nmi(intr_info)) 6226 return true; 6227 else if (is_page_fault(intr_info)) 6228 return true; 6229 return vmcs12->exception_bitmap & 6230 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6231 case EXIT_REASON_EXTERNAL_INTERRUPT: 6232 return nested_exit_on_intr(vcpu); 6233 case EXIT_REASON_TRIPLE_FAULT: 6234 return true; 6235 case EXIT_REASON_INTERRUPT_WINDOW: 6236 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6237 case EXIT_REASON_NMI_WINDOW: 6238 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6239 case EXIT_REASON_TASK_SWITCH: 6240 return true; 6241 case EXIT_REASON_CPUID: 6242 return true; 6243 case EXIT_REASON_HLT: 6244 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6245 case EXIT_REASON_INVD: 6246 return true; 6247 case EXIT_REASON_INVLPG: 6248 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6249 case EXIT_REASON_RDPMC: 6250 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6251 case EXIT_REASON_RDRAND: 6252 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6253 case EXIT_REASON_RDSEED: 6254 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6255 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6256 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6257 case EXIT_REASON_VMREAD: 6258 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6259 vmcs12->vmread_bitmap); 6260 case EXIT_REASON_VMWRITE: 6261 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6262 vmcs12->vmwrite_bitmap); 6263 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6264 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6265 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6266 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6267 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6268 /* 6269 * VMX instructions trap unconditionally. This allows L1 to 6270 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
6271 */ 6272 return true; 6273 case EXIT_REASON_CR_ACCESS: 6274 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6275 case EXIT_REASON_DR_ACCESS: 6276 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6277 case EXIT_REASON_IO_INSTRUCTION: 6278 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6279 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6280 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6281 case EXIT_REASON_MSR_READ: 6282 case EXIT_REASON_MSR_WRITE: 6283 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6284 case EXIT_REASON_INVALID_STATE: 6285 return true; 6286 case EXIT_REASON_MWAIT_INSTRUCTION: 6287 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6288 case EXIT_REASON_MONITOR_TRAP_FLAG: 6289 return nested_vmx_exit_handled_mtf(vmcs12); 6290 case EXIT_REASON_MONITOR_INSTRUCTION: 6291 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6292 case EXIT_REASON_PAUSE_INSTRUCTION: 6293 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6294 nested_cpu_has2(vmcs12, 6295 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6296 case EXIT_REASON_MCE_DURING_VMENTRY: 6297 return true; 6298 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6299 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6300 case EXIT_REASON_APIC_ACCESS: 6301 case EXIT_REASON_APIC_WRITE: 6302 case EXIT_REASON_EOI_INDUCED: 6303 /* 6304 * The controls for "virtualize APIC accesses," "APIC- 6305 * register virtualization," and "virtual-interrupt 6306 * delivery" only come from vmcs12. 6307 */ 6308 return true; 6309 case EXIT_REASON_INVPCID: 6310 return 6311 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6312 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6313 case EXIT_REASON_WBINVD: 6314 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6315 case EXIT_REASON_XSETBV: 6316 return true; 6317 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6318 /* 6319 * This should never happen, since it is not possible to 6320 * set XSS to a non-zero value---neither in L1 nor in L2. 6321 * If if it were, XSS would have to be checked against 6322 * the XSS exit bitmap in vmcs12. 6323 */ 6324 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 6325 case EXIT_REASON_UMWAIT: 6326 case EXIT_REASON_TPAUSE: 6327 return nested_cpu_has2(vmcs12, 6328 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6329 case EXIT_REASON_ENCLS: 6330 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6331 case EXIT_REASON_NOTIFY: 6332 /* Notify VM exit is not exposed to L1 */ 6333 return false; 6334 default: 6335 return true; 6336 } 6337 } 6338 6339 /* 6340 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6341 * reflected into L1. 6342 */ 6343 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6344 { 6345 struct vcpu_vmx *vmx = to_vmx(vcpu); 6346 union vmx_exit_reason exit_reason = vmx->exit_reason; 6347 unsigned long exit_qual; 6348 u32 exit_intr_info; 6349 6350 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6351 6352 /* 6353 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6354 * has already loaded L2's state. 6355 */ 6356 if (unlikely(vmx->fail)) { 6357 trace_kvm_nested_vmenter_failed( 6358 "hardware VM-instruction error: ", 6359 vmcs_read32(VM_INSTRUCTION_ERROR)); 6360 exit_intr_info = 0; 6361 exit_qual = 0; 6362 goto reflect_vmexit; 6363 } 6364 6365 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6366 6367 /* If L0 (KVM) wants the exit, it trumps L1's desires. 
*/ 6368 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6369 return false; 6370 6371 /* If L1 doesn't want the exit, handle it in L0. */ 6372 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6373 return false; 6374 6375 /* 6376 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6377 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6378 * need to be synthesized by querying the in-kernel LAPIC, but external 6379 * interrupts are never reflected to L1 so it's a non-issue. 6380 */ 6381 exit_intr_info = vmx_get_intr_info(vcpu); 6382 if (is_exception_with_error_code(exit_intr_info)) { 6383 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6384 6385 vmcs12->vm_exit_intr_error_code = 6386 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6387 } 6388 exit_qual = vmx_get_exit_qual(vcpu); 6389 6390 reflect_vmexit: 6391 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6392 return true; 6393 } 6394 6395 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6396 struct kvm_nested_state __user *user_kvm_nested_state, 6397 u32 user_data_size) 6398 { 6399 struct vcpu_vmx *vmx; 6400 struct vmcs12 *vmcs12; 6401 struct kvm_nested_state kvm_state = { 6402 .flags = 0, 6403 .format = KVM_STATE_NESTED_FORMAT_VMX, 6404 .size = sizeof(kvm_state), 6405 .hdr.vmx.flags = 0, 6406 .hdr.vmx.vmxon_pa = INVALID_GPA, 6407 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6408 .hdr.vmx.preemption_timer_deadline = 0, 6409 }; 6410 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6411 &user_kvm_nested_state->data.vmx[0]; 6412 6413 if (!vcpu) 6414 return kvm_state.size + sizeof(*user_vmx_nested_state); 6415 6416 vmx = to_vmx(vcpu); 6417 vmcs12 = get_vmcs12(vcpu); 6418 6419 if (nested_vmx_allowed(vcpu) && 6420 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6421 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6422 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6423 6424 if (vmx_has_valid_vmcs12(vcpu)) { 6425 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6426 6427 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6428 if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) 6429 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6430 6431 if (is_guest_mode(vcpu) && 6432 nested_cpu_has_shadow_vmcs(vmcs12) && 6433 vmcs12->vmcs_link_pointer != INVALID_GPA) 6434 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6435 } 6436 6437 if (vmx->nested.smm.vmxon) 6438 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6439 6440 if (vmx->nested.smm.guest_mode) 6441 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6442 6443 if (is_guest_mode(vcpu)) { 6444 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6445 6446 if (vmx->nested.nested_run_pending) 6447 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6448 6449 if (vmx->nested.mtf_pending) 6450 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6451 6452 if (nested_cpu_has_preemption_timer(vmcs12) && 6453 vmx->nested.has_preemption_timer_deadline) { 6454 kvm_state.hdr.vmx.flags |= 6455 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6456 kvm_state.hdr.vmx.preemption_timer_deadline = 6457 vmx->nested.preemption_timer_deadline; 6458 } 6459 } 6460 } 6461 6462 if (user_data_size < kvm_state.size) 6463 goto out; 6464 6465 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6466 return -EFAULT; 6467 6468 if (!vmx_has_valid_vmcs12(vcpu)) 6469 goto out; 6470 6471 /* 6472 * When running L2, the authoritative vmcs12 state is in the 6473 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6474 * in the shadow or enlightened vmcs linked to vmcs01, unless 6475 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6476 * vmcs12 state is in the vmcs12 already. 6477 */ 6478 if (is_guest_mode(vcpu)) { 6479 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6480 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6481 } else { 6482 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6483 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6484 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 6485 /* 6486 * L1 hypervisor is not obliged to keep eVMCS 6487 * clean fields data always up-to-date while 6488 * not in guest mode, 'hv_clean_fields' is only 6489 * supposed to be actual upon vmentry so we need 6490 * to ignore it here and do full copy. 6491 */ 6492 copy_enlightened_to_vmcs12(vmx, 0); 6493 else if (enable_shadow_vmcs) 6494 copy_shadow_to_vmcs12(vmx); 6495 } 6496 } 6497 6498 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6499 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6500 6501 /* 6502 * Copy over the full allocated size of vmcs12 rather than just the size 6503 * of the struct. 6504 */ 6505 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6506 return -EFAULT; 6507 6508 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6509 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6510 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6511 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6512 return -EFAULT; 6513 } 6514 out: 6515 return kvm_state.size; 6516 } 6517 6518 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6519 { 6520 if (is_guest_mode(vcpu)) { 6521 to_vmx(vcpu)->nested.nested_run_pending = 0; 6522 nested_vmx_vmexit(vcpu, -1, 0, 0); 6523 } 6524 free_nested(vcpu); 6525 } 6526 6527 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6528 struct kvm_nested_state __user *user_kvm_nested_state, 6529 struct kvm_nested_state *kvm_state) 6530 { 6531 struct vcpu_vmx *vmx = to_vmx(vcpu); 6532 struct vmcs12 *vmcs12; 6533 enum vm_entry_failure_code ignored; 6534 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6535 &user_kvm_nested_state->data.vmx[0]; 6536 int ret; 6537 6538 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6539 return -EINVAL; 6540 6541 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6542 if (kvm_state->hdr.vmx.smm.flags) 6543 return -EINVAL; 6544 6545 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6546 return -EINVAL; 6547 6548 /* 6549 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6550 * enable eVMCS capability on vCPU. However, since then 6551 * code was changed such that flag signals vmcs12 should 6552 * be copied into eVMCS in guest memory. 6553 * 6554 * To preserve backwards compatability, allow user 6555 * to set this flag even when there is no VMXON region. 
6556 */ 6557 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6558 return -EINVAL; 6559 } else { 6560 if (!nested_vmx_allowed(vcpu)) 6561 return -EINVAL; 6562 6563 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6564 return -EINVAL; 6565 } 6566 6567 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6568 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6569 return -EINVAL; 6570 6571 if (kvm_state->hdr.vmx.smm.flags & 6572 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6573 return -EINVAL; 6574 6575 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6576 return -EINVAL; 6577 6578 /* 6579 * SMM temporarily disables VMX, so we cannot be in guest mode, 6580 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6581 * must be zero. 6582 */ 6583 if (is_smm(vcpu) ? 6584 (kvm_state->flags & 6585 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6586 : kvm_state->hdr.vmx.smm.flags) 6587 return -EINVAL; 6588 6589 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6590 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6591 return -EINVAL; 6592 6593 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6594 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) 6595 return -EINVAL; 6596 6597 vmx_leave_nested(vcpu); 6598 6599 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6600 return 0; 6601 6602 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6603 ret = enter_vmx_operation(vcpu); 6604 if (ret) 6605 return ret; 6606 6607 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6608 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6609 /* See vmx_has_valid_vmcs12. */ 6610 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6611 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6612 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6613 return -EINVAL; 6614 else 6615 return 0; 6616 } 6617 6618 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6619 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6620 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6621 return -EINVAL; 6622 6623 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6624 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6625 /* 6626 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6627 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6628 * restored yet. EVMCS will be mapped from 6629 * nested_get_vmcs12_pages(). 
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	if (vmx->nested.mtf_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))

static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so called "index" of the VMCS field encoding, not
	 * the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1. Simply find the field with the highest index.
	 */
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated.
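		 * For illustration: vmcs12 index 2 maps back to encoding
		 * 0x0800 (GUEST_ES_SELECTOR) under the rotate-by-6 scheme
		 * above, and vmcs_field_index() then extracts bits 9:1 of
		 * that encoding.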
		 */
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control msrs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
	struct nested_vmx_msrs *msrs = &vmcs_conf->nested;

	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
	 * fields of vmcs01 and vmcs02, will turn these bits off - and
	 * nested_vmx_l1_wants_exit() will not pass related exits to L1.
	 * These rules have exceptions below.
	 */

	/* pin-based controls */
	msrs->pinbased_ctls_low =
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;

	/* exit controls */
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;

	/* entry controls */
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
		 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);

	/* We support free control of debug control loading.
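	 * As with debug control saving above, "free control" means the bit is
	 * cleared from the must-be-1 (low) half, so L1 may leave
	 * VM_ENTRY_LOAD_DEBUG_CONTROLS clear.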
	 */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;

	/* cpu-based controls */
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);

	/*
	 * secondary cpu-based controls. Do not include those that
	 * depend on CPUID bits; they are added later by
	 * vmx_vcpu_after_set_cpuid.
	 */
	msrs->secondary_ctls_low = 0;

	msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_XSAVES |
		SECONDARY_EXEC_TSC_SCALING |
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}
	}

	if (cpu_has_vmx_vmfunc()) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VMFUNC;
		/*
		 * Advertise EPTP switching unconditionally
		 * since we emulate it
		 */
		if (enable_ept)
			msrs->vmfunc_controls =
				VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context.
	 * The alternative is not failing the single-context invvpid,
	 * and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;

	/* miscellaneous data */
	msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;

	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic =
		VMCS12_REVISION |
		VMX_BASIC_TRUE_CTLS |
		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;

	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
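			 * This is why plain GFP_KERNEL is used below instead
			 * of GFP_KERNEL_ACCOUNT.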
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON]		= handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept;
	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc;

	return 0;
}

struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
};