1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/objtool.h> 4 #include <linux/percpu.h> 5 6 #include <asm/debugreg.h> 7 #include <asm/mmu_context.h> 8 9 #include "cpuid.h" 10 #include "evmcs.h" 11 #include "hyperv.h" 12 #include "mmu.h" 13 #include "nested.h" 14 #include "pmu.h" 15 #include "sgx.h" 16 #include "trace.h" 17 #include "vmx.h" 18 #include "x86.h" 19 20 static bool __read_mostly enable_shadow_vmcs = 1; 21 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 22 23 static bool __read_mostly nested_early_check = 0; 24 module_param(nested_early_check, bool, S_IRUGO); 25 26 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 27 28 /* 29 * Hyper-V requires all of these, so mark them as supported even though 30 * they are just treated the same as all-context. 31 */ 32 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 33 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 34 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 35 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 36 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 37 38 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 39 40 enum { 41 VMX_VMREAD_BITMAP, 42 VMX_VMWRITE_BITMAP, 43 VMX_BITMAP_NR 44 }; 45 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 46 47 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 48 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 49 50 struct shadow_vmcs_field { 51 u16 encoding; 52 u16 offset; 53 }; 54 static struct shadow_vmcs_field shadow_read_only_fields[] = { 55 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 56 #include "vmcs_shadow_fields.h" 57 }; 58 static int max_shadow_read_only_fields = 59 ARRAY_SIZE(shadow_read_only_fields); 60 61 static struct shadow_vmcs_field shadow_read_write_fields[] = { 62 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 63 #include "vmcs_shadow_fields.h" 64 }; 65 static int max_shadow_read_write_fields = 66 ARRAY_SIZE(shadow_read_write_fields); 67 68 static void init_vmcs_shadow_fields(void) 69 { 70 int i, j; 71 
72 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 73 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 74 75 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 76 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 77 u16 field = entry.encoding; 78 79 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 80 (i + 1 == max_shadow_read_only_fields || 81 shadow_read_only_fields[i + 1].encoding != field + 1)) 82 pr_err("Missing field from shadow_read_only_field %x\n", 83 field + 1); 84 85 clear_bit(field, vmx_vmread_bitmap); 86 if (field & 1) 87 #ifdef CONFIG_X86_64 88 continue; 89 #else 90 entry.offset += sizeof(u32); 91 #endif 92 shadow_read_only_fields[j++] = entry; 93 } 94 max_shadow_read_only_fields = j; 95 96 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 97 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 98 u16 field = entry.encoding; 99 100 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 101 (i + 1 == max_shadow_read_write_fields || 102 shadow_read_write_fields[i + 1].encoding != field + 1)) 103 pr_err("Missing field from shadow_read_write_field %x\n", 104 field + 1); 105 106 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 107 field <= GUEST_TR_AR_BYTES, 108 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 109 110 /* 111 * PML and the preemption timer can be emulated, but the 112 * processor cannot vmwrite to fields that don't exist 113 * on bare metal. 
114 */ 115 switch (field) { 116 case GUEST_PML_INDEX: 117 if (!cpu_has_vmx_pml()) 118 continue; 119 break; 120 case VMX_PREEMPTION_TIMER_VALUE: 121 if (!cpu_has_vmx_preemption_timer()) 122 continue; 123 break; 124 case GUEST_INTR_STATUS: 125 if (!cpu_has_vmx_apicv()) 126 continue; 127 break; 128 default: 129 break; 130 } 131 132 clear_bit(field, vmx_vmwrite_bitmap); 133 clear_bit(field, vmx_vmread_bitmap); 134 if (field & 1) 135 #ifdef CONFIG_X86_64 136 continue; 137 #else 138 entry.offset += sizeof(u32); 139 #endif 140 shadow_read_write_fields[j++] = entry; 141 } 142 max_shadow_read_write_fields = j; 143 } 144 145 /* 146 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 147 * set the success or error code of an emulated VMX instruction (as specified 148 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated 149 * instruction. 150 */ 151 static int nested_vmx_succeed(struct kvm_vcpu *vcpu) 152 { 153 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 154 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 155 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 156 return kvm_skip_emulated_instruction(vcpu); 157 } 158 159 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 160 { 161 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 162 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 163 X86_EFLAGS_SF | X86_EFLAGS_OF)) 164 | X86_EFLAGS_CF); 165 return kvm_skip_emulated_instruction(vcpu); 166 } 167 168 static int nested_vmx_failValid(struct kvm_vcpu *vcpu, 169 u32 vm_instruction_error) 170 { 171 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 172 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 173 X86_EFLAGS_SF | X86_EFLAGS_OF)) 174 | X86_EFLAGS_ZF); 175 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 176 /* 177 * We don't need to force sync to shadow VMCS because 178 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all 179 * fields and thus must be synced. 
180 */ 181 if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID) 182 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; 183 184 return kvm_skip_emulated_instruction(vcpu); 185 } 186 187 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) 188 { 189 struct vcpu_vmx *vmx = to_vmx(vcpu); 190 191 /* 192 * failValid writes the error number to the current VMCS, which 193 * can't be done if there isn't a current VMCS. 194 */ 195 if (vmx->nested.current_vmptr == INVALID_GPA && 196 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 197 return nested_vmx_failInvalid(vcpu); 198 199 return nested_vmx_failValid(vcpu, vm_instruction_error); 200 } 201 202 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 203 { 204 /* TODO: not to reset guest simply here. */ 205 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 206 pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); 207 } 208 209 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 210 { 211 return fixed_bits_valid(control, low, high); 212 } 213 214 static inline u64 vmx_control_msr(u32 low, u32 high) 215 { 216 return low | ((u64)high << 32); 217 } 218 219 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 220 { 221 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 222 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 223 vmx->nested.need_vmcs12_to_shadow_sync = false; 224 } 225 226 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) 227 { 228 struct vcpu_vmx *vmx = to_vmx(vcpu); 229 230 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 231 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); 232 vmx->nested.hv_evmcs = NULL; 233 } 234 235 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 236 } 237 238 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 239 struct loaded_vmcs *prev) 240 { 241 struct vmcs_host_state *dest, *src; 242 243 if (unlikely(!vmx->guest_state_loaded)) 244 return; 245 246 src = 
&prev->host_state; 247 dest = &vmx->loaded_vmcs->host_state; 248 249 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 250 dest->ldt_sel = src->ldt_sel; 251 #ifdef CONFIG_X86_64 252 dest->ds_sel = src->ds_sel; 253 dest->es_sel = src->es_sel; 254 #endif 255 } 256 257 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 258 { 259 struct vcpu_vmx *vmx = to_vmx(vcpu); 260 struct loaded_vmcs *prev; 261 int cpu; 262 263 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) 264 return; 265 266 cpu = get_cpu(); 267 prev = vmx->loaded_vmcs; 268 vmx->loaded_vmcs = vmcs; 269 vmx_vcpu_load_vmcs(vcpu, cpu, prev); 270 vmx_sync_vmcs_host_state(vmx, prev); 271 put_cpu(); 272 273 vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; 274 275 /* 276 * All lazily updated registers will be reloaded from VMCS12 on both 277 * vmentry and vmexit. 278 */ 279 vcpu->arch.regs_dirty = 0; 280 } 281 282 /* 283 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 284 * just stops using VMX. 
285 */ 286 static void free_nested(struct kvm_vcpu *vcpu) 287 { 288 struct vcpu_vmx *vmx = to_vmx(vcpu); 289 290 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) 291 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 292 293 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 294 return; 295 296 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 297 298 vmx->nested.vmxon = false; 299 vmx->nested.smm.vmxon = false; 300 vmx->nested.vmxon_ptr = INVALID_GPA; 301 free_vpid(vmx->nested.vpid02); 302 vmx->nested.posted_intr_nv = -1; 303 vmx->nested.current_vmptr = INVALID_GPA; 304 if (enable_shadow_vmcs) { 305 vmx_disable_shadow_vmcs(vmx); 306 vmcs_clear(vmx->vmcs01.shadow_vmcs); 307 free_vmcs(vmx->vmcs01.shadow_vmcs); 308 vmx->vmcs01.shadow_vmcs = NULL; 309 } 310 kfree(vmx->nested.cached_vmcs12); 311 vmx->nested.cached_vmcs12 = NULL; 312 kfree(vmx->nested.cached_shadow_vmcs12); 313 vmx->nested.cached_shadow_vmcs12 = NULL; 314 /* 315 * Unpin physical memory we referred to in the vmcs02. The APIC access 316 * page's backing page (yeah, confusing) shouldn't actually be accessed, 317 * and if it is written, the contents are irrelevant. 318 */ 319 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 320 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 321 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 322 vmx->nested.pi_desc = NULL; 323 324 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 325 326 nested_release_evmcs(vcpu); 327 328 free_loaded_vmcs(&vmx->nested.vmcs02); 329 } 330 331 /* 332 * Ensure that the current vmcs of the logical processor is the 333 * vmcs01 of the vcpu before calling free_nested(). 
334 */ 335 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 336 { 337 vcpu_load(vcpu); 338 vmx_leave_nested(vcpu); 339 vcpu_put(vcpu); 340 } 341 342 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 343 344 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 345 { 346 return VALID_PAGE(root_hpa) && 347 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 348 } 349 350 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 351 gpa_t addr) 352 { 353 uint i; 354 struct kvm_mmu_root_info *cached_root; 355 356 WARN_ON_ONCE(!mmu_is_nested(vcpu)); 357 358 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 359 cached_root = &vcpu->arch.mmu->prev_roots[i]; 360 361 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 362 eptp)) 363 vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); 364 } 365 } 366 367 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 368 struct x86_exception *fault) 369 { 370 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 371 struct vcpu_vmx *vmx = to_vmx(vcpu); 372 u32 vm_exit_reason; 373 unsigned long exit_qualification = vcpu->arch.exit_qualification; 374 375 if (vmx->nested.pml_full) { 376 vm_exit_reason = EXIT_REASON_PML_FULL; 377 vmx->nested.pml_full = false; 378 exit_qualification &= INTR_INFO_UNBLOCK_NMI; 379 } else { 380 if (fault->error_code & PFERR_RSVD_MASK) 381 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 382 else 383 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 384 385 /* 386 * Although the caller (kvm_inject_emulated_page_fault) would 387 * have already synced the faulting address in the shadow EPT 388 * tables for the current EPTP12, we also need to sync it for 389 * any other cached EPTP02s based on the same EP4TA, since the 390 * TLB associates mappings to the EP4TA rather than the full EPTP. 
391 */ 392 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, 393 fault->address); 394 } 395 396 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 397 vmcs12->guest_physical_address = fault->address; 398 } 399 400 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) 401 { 402 struct vcpu_vmx *vmx = to_vmx(vcpu); 403 bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; 404 int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); 405 406 kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, 407 nested_ept_ad_enabled(vcpu), 408 nested_ept_get_eptp(vcpu)); 409 } 410 411 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 412 { 413 WARN_ON(mmu_is_nested(vcpu)); 414 415 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 416 nested_ept_new_eptp(vcpu); 417 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; 418 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 419 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 420 421 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 422 } 423 424 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 425 { 426 vcpu->arch.mmu = &vcpu->arch.root_mmu; 427 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 428 } 429 430 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 431 u16 error_code) 432 { 433 bool inequality, bit; 434 435 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 436 inequality = 437 (error_code & vmcs12->page_fault_error_code_mask) != 438 vmcs12->page_fault_error_code_match; 439 return inequality ^ bit; 440 } 441 442 443 /* 444 * KVM wants to inject page-faults which it got to the guest. This function 445 * checks whether in a nested guest, we need to inject them to L1 or L2. 
446 */ 447 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) 448 { 449 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 450 unsigned int nr = vcpu->arch.exception.nr; 451 bool has_payload = vcpu->arch.exception.has_payload; 452 unsigned long payload = vcpu->arch.exception.payload; 453 454 if (nr == PF_VECTOR) { 455 if (vcpu->arch.exception.nested_apf) { 456 *exit_qual = vcpu->arch.apf.nested_apf_token; 457 return 1; 458 } 459 if (nested_vmx_is_page_fault_vmexit(vmcs12, 460 vcpu->arch.exception.error_code)) { 461 *exit_qual = has_payload ? payload : vcpu->arch.cr2; 462 return 1; 463 } 464 } else if (vmcs12->exception_bitmap & (1u << nr)) { 465 if (nr == DB_VECTOR) { 466 if (!has_payload) { 467 payload = vcpu->arch.dr6; 468 payload &= ~DR6_BT; 469 payload ^= DR6_ACTIVE_LOW; 470 } 471 *exit_qual = payload; 472 } else 473 *exit_qual = 0; 474 return 1; 475 } 476 477 return 0; 478 } 479 480 static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu, 481 struct x86_exception *fault) 482 { 483 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 484 485 WARN_ON(!is_guest_mode(vcpu)); 486 487 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && 488 !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) { 489 vmcs12->vm_exit_intr_error_code = fault->error_code; 490 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 491 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | 492 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, 493 fault->address); 494 return true; 495 } 496 return false; 497 } 498 499 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 500 struct vmcs12 *vmcs12) 501 { 502 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 503 return 0; 504 505 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 506 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 507 return -EINVAL; 508 509 return 0; 510 } 511 512 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 513 struct vmcs12 *vmcs12) 
514 { 515 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 516 return 0; 517 518 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 519 return -EINVAL; 520 521 return 0; 522 } 523 524 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 525 struct vmcs12 *vmcs12) 526 { 527 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 528 return 0; 529 530 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 531 return -EINVAL; 532 533 return 0; 534 } 535 536 /* 537 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 538 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, 539 * only the "disable intercept" case needs to be handled. 540 */ 541 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, 542 unsigned long *msr_bitmap_l0, 543 u32 msr, int type) 544 { 545 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) 546 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); 547 548 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) 549 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); 550 } 551 552 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 553 { 554 int msr; 555 556 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 557 unsigned word = msr / BITS_PER_LONG; 558 559 msr_bitmap[word] = ~0; 560 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 561 } 562 } 563 564 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ 565 static inline \ 566 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ 567 unsigned long *msr_bitmap_l1, \ 568 unsigned long *msr_bitmap_l0, u32 msr) \ 569 { \ 570 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ 571 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ 572 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 573 else \ 574 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 575 } 576 BUILD_NVMX_MSR_INTERCEPT_HELPER(read) 577 
BUILD_NVMX_MSR_INTERCEPT_HELPER(write) 578 579 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, 580 unsigned long *msr_bitmap_l1, 581 unsigned long *msr_bitmap_l0, 582 u32 msr, int types) 583 { 584 if (types & MSR_TYPE_R) 585 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, 586 msr_bitmap_l0, msr); 587 if (types & MSR_TYPE_W) 588 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, 589 msr_bitmap_l0, msr); 590 } 591 592 /* 593 * Merge L0's and L1's MSR bitmap, return false to indicate that 594 * we do not use the hardware. 595 */ 596 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 597 struct vmcs12 *vmcs12) 598 { 599 struct vcpu_vmx *vmx = to_vmx(vcpu); 600 int msr; 601 unsigned long *msr_bitmap_l1; 602 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; 603 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 604 struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; 605 606 /* Nothing to do if the MSR bitmap is not in use. */ 607 if (!cpu_has_vmx_msr_bitmap() || 608 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 609 return false; 610 611 /* 612 * MSR bitmap update can be skipped when: 613 * - MSR bitmap for L1 hasn't changed. 614 * - Nested hypervisor (L1) is attempting to launch the same L2 as 615 * before. 616 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature 617 * and tells KVM (L0) there were no changes in MSR bitmap for L2. 
618 */ 619 if (!vmx->nested.force_msr_bitmap_recalc && evmcs && 620 evmcs->hv_enlightenments_control.msr_bitmap && 621 evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) 622 return true; 623 624 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) 625 return false; 626 627 msr_bitmap_l1 = (unsigned long *)map->hva; 628 629 /* 630 * To keep the control flow simple, pay eight 8-byte writes (sixteen 631 * 4-byte writes on 32-bit systems) up front to enable intercepts for 632 * the x2APIC MSR range and selectively toggle those relevant to L2. 633 */ 634 enable_x2apic_msr_intercepts(msr_bitmap_l0); 635 636 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 637 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 638 /* 639 * L0 need not intercept reads for MSRs between 0x800 640 * and 0x8ff, it just lets the processor take the value 641 * from the virtual-APIC page; take those 256 bits 642 * directly from the L1 bitmap. 643 */ 644 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 645 unsigned word = msr / BITS_PER_LONG; 646 647 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 648 } 649 } 650 651 nested_vmx_disable_intercept_for_x2apic_msr( 652 msr_bitmap_l1, msr_bitmap_l0, 653 X2APIC_MSR(APIC_TASKPRI), 654 MSR_TYPE_R | MSR_TYPE_W); 655 656 if (nested_cpu_has_vid(vmcs12)) { 657 nested_vmx_disable_intercept_for_x2apic_msr( 658 msr_bitmap_l1, msr_bitmap_l0, 659 X2APIC_MSR(APIC_EOI), 660 MSR_TYPE_W); 661 nested_vmx_disable_intercept_for_x2apic_msr( 662 msr_bitmap_l1, msr_bitmap_l0, 663 X2APIC_MSR(APIC_SELF_IPI), 664 MSR_TYPE_W); 665 } 666 } 667 668 /* 669 * Always check vmcs01's bitmap to honor userspace MSR filters and any 670 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 
671 */ 672 #ifdef CONFIG_X86_64 673 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 674 MSR_FS_BASE, MSR_TYPE_RW); 675 676 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 677 MSR_GS_BASE, MSR_TYPE_RW); 678 679 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 680 MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 681 #endif 682 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 683 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 684 685 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 686 MSR_IA32_PRED_CMD, MSR_TYPE_W); 687 688 kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); 689 690 vmx->nested.force_msr_bitmap_recalc = false; 691 692 return true; 693 } 694 695 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 696 struct vmcs12 *vmcs12) 697 { 698 struct vcpu_vmx *vmx = to_vmx(vcpu); 699 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 700 701 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 702 vmcs12->vmcs_link_pointer == INVALID_GPA) 703 return; 704 705 if (ghc->gpa != vmcs12->vmcs_link_pointer && 706 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 707 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 708 return; 709 710 kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 711 VMCS12_SIZE); 712 } 713 714 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 715 struct vmcs12 *vmcs12) 716 { 717 struct vcpu_vmx *vmx = to_vmx(vcpu); 718 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 719 720 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 721 vmcs12->vmcs_link_pointer == INVALID_GPA) 722 return; 723 724 if (ghc->gpa != vmcs12->vmcs_link_pointer && 725 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 726 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 727 return; 728 729 kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 730 VMCS12_SIZE); 731 } 732 733 /* 734 * In nested virtualization, check if L1 has set 735 * VM_EXIT_ACK_INTR_ON_EXIT 736 */ 
737 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 738 { 739 return get_vmcs12(vcpu)->vm_exit_controls & 740 VM_EXIT_ACK_INTR_ON_EXIT; 741 } 742 743 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 744 struct vmcs12 *vmcs12) 745 { 746 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 747 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 748 return -EINVAL; 749 else 750 return 0; 751 } 752 753 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 754 struct vmcs12 *vmcs12) 755 { 756 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 757 !nested_cpu_has_apic_reg_virt(vmcs12) && 758 !nested_cpu_has_vid(vmcs12) && 759 !nested_cpu_has_posted_intr(vmcs12)) 760 return 0; 761 762 /* 763 * If virtualize x2apic mode is enabled, 764 * virtualize apic access must be disabled. 765 */ 766 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 767 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 768 return -EINVAL; 769 770 /* 771 * If virtual interrupt delivery is enabled, 772 * we must exit on external interrupts. 773 */ 774 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 775 return -EINVAL; 776 777 /* 778 * bits 15:8 should be zero in posted_intr_nv, 779 * the descriptor address has been already checked 780 * in nested_get_vmcs12_pages. 781 * 782 * bits 5:0 of posted_intr_desc_addr should be zero. 783 */ 784 if (nested_cpu_has_posted_intr(vmcs12) && 785 (CC(!nested_cpu_has_vid(vmcs12)) || 786 CC(!nested_exit_intr_ack_set(vcpu)) || 787 CC((vmcs12->posted_intr_nv & 0xff00)) || 788 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) 789 return -EINVAL; 790 791 /* tpr shadow is needed by all apicv features. 
*/ 792 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 793 return -EINVAL; 794 795 return 0; 796 } 797 798 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 799 u32 count, u64 addr) 800 { 801 if (count == 0) 802 return 0; 803 804 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || 805 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) 806 return -EINVAL; 807 808 return 0; 809 } 810 811 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 812 struct vmcs12 *vmcs12) 813 { 814 if (CC(nested_vmx_check_msr_switch(vcpu, 815 vmcs12->vm_exit_msr_load_count, 816 vmcs12->vm_exit_msr_load_addr)) || 817 CC(nested_vmx_check_msr_switch(vcpu, 818 vmcs12->vm_exit_msr_store_count, 819 vmcs12->vm_exit_msr_store_addr))) 820 return -EINVAL; 821 822 return 0; 823 } 824 825 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 826 struct vmcs12 *vmcs12) 827 { 828 if (CC(nested_vmx_check_msr_switch(vcpu, 829 vmcs12->vm_entry_msr_load_count, 830 vmcs12->vm_entry_msr_load_addr))) 831 return -EINVAL; 832 833 return 0; 834 } 835 836 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 837 struct vmcs12 *vmcs12) 838 { 839 if (!nested_cpu_has_pml(vmcs12)) 840 return 0; 841 842 if (CC(!nested_cpu_has_ept(vmcs12)) || 843 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 844 return -EINVAL; 845 846 return 0; 847 } 848 849 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 850 struct vmcs12 *vmcs12) 851 { 852 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 853 !nested_cpu_has_ept(vmcs12))) 854 return -EINVAL; 855 return 0; 856 } 857 858 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 859 struct vmcs12 *vmcs12) 860 { 861 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 862 !nested_cpu_has_ept(vmcs12))) 863 return -EINVAL; 864 return 0; 865 } 866 867 static int 
nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 868 struct vmcs12 *vmcs12) 869 { 870 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 871 return 0; 872 873 if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 874 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 875 return -EINVAL; 876 877 return 0; 878 } 879 880 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 881 struct vmx_msr_entry *e) 882 { 883 /* x2APIC MSR accesses are not allowed */ 884 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 885 return -EINVAL; 886 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 887 CC(e->index == MSR_IA32_UCODE_REV)) 888 return -EINVAL; 889 if (CC(e->reserved != 0)) 890 return -EINVAL; 891 return 0; 892 } 893 894 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 895 struct vmx_msr_entry *e) 896 { 897 if (CC(e->index == MSR_FS_BASE) || 898 CC(e->index == MSR_GS_BASE) || 899 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 900 nested_vmx_msr_check_common(vcpu, e)) 901 return -EINVAL; 902 return 0; 903 } 904 905 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 906 struct vmx_msr_entry *e) 907 { 908 if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ 909 nested_vmx_msr_check_common(vcpu, e)) 910 return -EINVAL; 911 return 0; 912 } 913 914 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) 915 { 916 struct vcpu_vmx *vmx = to_vmx(vcpu); 917 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 918 vmx->nested.msrs.misc_high); 919 920 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; 921 } 922 923 /* 924 * Load guest's/host's msr at nested entry/exit. 925 * return 0 for success, entry index for failure. 926 * 927 * One of the failure modes for MSR load/store is when a list exceeds the 928 * virtual hardware's capacity. 
To maintain compatibility with hardware inasmuch 929 * as possible, process all valid entries before failing rather than precheck 930 * for a capacity violation. 931 */ 932 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 933 { 934 u32 i; 935 struct vmx_msr_entry e; 936 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 937 938 for (i = 0; i < count; i++) { 939 if (unlikely(i >= max_msr_list_size)) 940 goto fail; 941 942 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 943 &e, sizeof(e))) { 944 pr_debug_ratelimited( 945 "%s cannot read MSR entry (%u, 0x%08llx)\n", 946 __func__, i, gpa + i * sizeof(e)); 947 goto fail; 948 } 949 if (nested_vmx_load_msr_check(vcpu, &e)) { 950 pr_debug_ratelimited( 951 "%s check failed (%u, 0x%x, 0x%x)\n", 952 __func__, i, e.index, e.reserved); 953 goto fail; 954 } 955 if (kvm_set_msr(vcpu, e.index, e.value)) { 956 pr_debug_ratelimited( 957 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 958 __func__, i, e.index, e.value); 959 goto fail; 960 } 961 } 962 return 0; 963 fail: 964 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ 965 return i + 1; 966 } 967 968 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, 969 u32 msr_index, 970 u64 *data) 971 { 972 struct vcpu_vmx *vmx = to_vmx(vcpu); 973 974 /* 975 * If the L0 hypervisor stored a more accurate value for the TSC that 976 * does not include the time taken for emulation of the L2->L1 977 * VM-exit in L0, use the more accurate value. 
978 */ 979 if (msr_index == MSR_IA32_TSC) { 980 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, 981 MSR_IA32_TSC); 982 983 if (i >= 0) { 984 u64 val = vmx->msr_autostore.guest.val[i].value; 985 986 *data = kvm_read_l1_tsc(vcpu, val); 987 return true; 988 } 989 } 990 991 if (kvm_get_msr(vcpu, msr_index, data)) { 992 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 993 msr_index); 994 return false; 995 } 996 return true; 997 } 998 999 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 1000 struct vmx_msr_entry *e) 1001 { 1002 if (kvm_vcpu_read_guest(vcpu, 1003 gpa + i * sizeof(*e), 1004 e, 2 * sizeof(u32))) { 1005 pr_debug_ratelimited( 1006 "%s cannot read MSR entry (%u, 0x%08llx)\n", 1007 __func__, i, gpa + i * sizeof(*e)); 1008 return false; 1009 } 1010 if (nested_vmx_store_msr_check(vcpu, e)) { 1011 pr_debug_ratelimited( 1012 "%s check failed (%u, 0x%x, 0x%x)\n", 1013 __func__, i, e->index, e->reserved); 1014 return false; 1015 } 1016 return true; 1017 } 1018 1019 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 1020 { 1021 u64 data; 1022 u32 i; 1023 struct vmx_msr_entry e; 1024 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1025 1026 for (i = 0; i < count; i++) { 1027 if (unlikely(i >= max_msr_list_size)) 1028 return -EINVAL; 1029 1030 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1031 return -EINVAL; 1032 1033 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 1034 return -EINVAL; 1035 1036 if (kvm_vcpu_write_guest(vcpu, 1037 gpa + i * sizeof(e) + 1038 offsetof(struct vmx_msr_entry, value), 1039 &data, sizeof(data))) { 1040 pr_debug_ratelimited( 1041 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1042 __func__, i, e.index, data); 1043 return -EINVAL; 1044 } 1045 } 1046 return 0; 1047 } 1048 1049 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1050 { 1051 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1052 u32 count = 
vmcs12->vm_exit_msr_store_count; 1053 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1054 struct vmx_msr_entry e; 1055 u32 i; 1056 1057 for (i = 0; i < count; i++) { 1058 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1059 return false; 1060 1061 if (e.index == msr_index) 1062 return true; 1063 } 1064 return false; 1065 } 1066 1067 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, 1068 u32 msr_index) 1069 { 1070 struct vcpu_vmx *vmx = to_vmx(vcpu); 1071 struct vmx_msrs *autostore = &vmx->msr_autostore.guest; 1072 bool in_vmcs12_store_list; 1073 int msr_autostore_slot; 1074 bool in_autostore_list; 1075 int last; 1076 1077 msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); 1078 in_autostore_list = msr_autostore_slot >= 0; 1079 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); 1080 1081 if (in_vmcs12_store_list && !in_autostore_list) { 1082 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { 1083 /* 1084 * Emulated VMEntry does not fail here. Instead a less 1085 * accurate value will be returned by 1086 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() 1087 * instead of reading the value from the vmcs02 VMExit 1088 * MSR-store area. 1089 */ 1090 pr_warn_ratelimited( 1091 "Not enough msr entries in msr_autostore. Can't add msr %x\n", 1092 msr_index); 1093 return; 1094 } 1095 last = autostore->nr++; 1096 autostore->val[last].index = msr_index; 1097 } else if (!in_vmcs12_store_list && in_autostore_list) { 1098 last = --autostore->nr; 1099 autostore->val[msr_autostore_slot] = autostore->val[last]; 1100 } 1101 } 1102 1103 /* 1104 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are 1105 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected 1106 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to 1107 * @entry_failure_code. 
1108 */ 1109 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, 1110 bool nested_ept, bool reload_pdptrs, 1111 enum vm_entry_failure_code *entry_failure_code) 1112 { 1113 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { 1114 *entry_failure_code = ENTRY_FAIL_DEFAULT; 1115 return -EINVAL; 1116 } 1117 1118 /* 1119 * If PAE paging and EPT are both on, CR3 is not used by the CPU and 1120 * must not be dereferenced. 1121 */ 1122 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && 1123 CC(!load_pdptrs(vcpu, cr3))) { 1124 *entry_failure_code = ENTRY_FAIL_PDPTE; 1125 return -EINVAL; 1126 } 1127 1128 vcpu->arch.cr3 = cr3; 1129 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 1130 1131 /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ 1132 kvm_init_mmu(vcpu); 1133 1134 if (!nested_ept) 1135 kvm_mmu_new_pgd(vcpu, cr3); 1136 1137 return 0; 1138 } 1139 1140 /* 1141 * Returns if KVM is able to config CPU to tag TLB entries 1142 * populated by L2 differently than TLB entries populated 1143 * by L1. 1144 * 1145 * If L0 uses EPT, L1 and L2 run with different EPTP because 1146 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries 1147 * are tagged with different EPTP. 1148 * 1149 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged 1150 * with different VPID (L1 entries are tagged with vmx->vpid 1151 * while L2 entries are tagged with vmx->nested.vpid02). 1152 */ 1153 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) 1154 { 1155 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1156 1157 return enable_ept || 1158 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); 1159 } 1160 1161 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, 1162 struct vmcs12 *vmcs12, 1163 bool is_vmenter) 1164 { 1165 struct vcpu_vmx *vmx = to_vmx(vcpu); 1166 1167 /* 1168 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings 1169 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. 
it's a 1170 * full TLB flush from the guest's perspective. This is required even 1171 * if VPID is disabled in the host as KVM may need to synchronize the 1172 * MMU in response to the guest TLB flush. 1173 * 1174 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. 1175 * EPT is a special snowflake, as guest-physical mappings aren't 1176 * flushed on VPID invalidations, including VM-Enter or VM-Exit with 1177 * VPID disabled. As a result, KVM _never_ needs to sync nEPT 1178 * entries on VM-Enter because L1 can't rely on VM-Enter to flush 1179 * those mappings. 1180 */ 1181 if (!nested_cpu_has_vpid(vmcs12)) { 1182 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1183 return; 1184 } 1185 1186 /* L2 should never have a VPID if VPID is disabled. */ 1187 WARN_ON(!enable_vpid); 1188 1189 /* 1190 * VPID is enabled and in use by vmcs12. If vpid12 is changing, then 1191 * emulate a guest TLB flush as KVM does not track vpid12 history nor 1192 * is the VPID incorporated into the MMU context. I.e. KVM must assume 1193 * that the new vpid12 has never been used and thus represents a new 1194 * guest ASID that cannot have entries in the TLB. 1195 */ 1196 if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 1197 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 1198 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1199 return; 1200 } 1201 1202 /* 1203 * If VPID is enabled, used by vmc12, and vpid12 is not changing but 1204 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and 1205 * KVM was unable to allocate a VPID for L2, flush the current context 1206 * as the effective ASID is common to both L1 and L2. 
1207 */ 1208 if (!nested_has_guest_tlb_tag(vcpu)) 1209 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 1210 } 1211 1212 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) 1213 { 1214 superset &= mask; 1215 subset &= mask; 1216 1217 return (superset | subset) == superset; 1218 } 1219 1220 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1221 { 1222 const u64 feature_and_reserved = 1223 /* feature (except bit 48; see below) */ 1224 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | 1225 /* reserved */ 1226 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); 1227 u64 vmx_basic = vmcs_config.nested.basic; 1228 1229 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) 1230 return -EINVAL; 1231 1232 /* 1233 * KVM does not emulate a version of VMX that constrains physical 1234 * addresses of VMX structures (e.g. VMCS) to 32-bits. 1235 */ 1236 if (data & BIT_ULL(48)) 1237 return -EINVAL; 1238 1239 if (vmx_basic_vmcs_revision_id(vmx_basic) != 1240 vmx_basic_vmcs_revision_id(data)) 1241 return -EINVAL; 1242 1243 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 1244 return -EINVAL; 1245 1246 vmx->nested.msrs.basic = data; 1247 return 0; 1248 } 1249 1250 static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, 1251 u32 **low, u32 **high) 1252 { 1253 switch (msr_index) { 1254 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1255 *low = &msrs->pinbased_ctls_low; 1256 *high = &msrs->pinbased_ctls_high; 1257 break; 1258 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1259 *low = &msrs->procbased_ctls_low; 1260 *high = &msrs->procbased_ctls_high; 1261 break; 1262 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1263 *low = &msrs->exit_ctls_low; 1264 *high = &msrs->exit_ctls_high; 1265 break; 1266 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1267 *low = &msrs->entry_ctls_low; 1268 *high = &msrs->entry_ctls_high; 1269 break; 1270 case MSR_IA32_VMX_PROCBASED_CTLS2: 1271 *low = &msrs->secondary_ctls_low; 1272 *high = &msrs->secondary_ctls_high; 1273 break; 
1274 default: 1275 BUG(); 1276 } 1277 } 1278 1279 static int 1280 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1281 { 1282 u32 *lowp, *highp; 1283 u64 supported; 1284 1285 vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); 1286 1287 supported = vmx_control_msr(*lowp, *highp); 1288 1289 /* Check must-be-1 bits are still 1. */ 1290 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) 1291 return -EINVAL; 1292 1293 /* Check must-be-0 bits are still 0. */ 1294 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) 1295 return -EINVAL; 1296 1297 vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); 1298 *lowp = data; 1299 *highp = data >> 32; 1300 return 0; 1301 } 1302 1303 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1304 { 1305 const u64 feature_and_reserved_bits = 1306 /* feature */ 1307 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | 1308 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | 1309 /* reserved */ 1310 GENMASK_ULL(13, 9) | BIT_ULL(31); 1311 u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, 1312 vmcs_config.nested.misc_high); 1313 1314 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) 1315 return -EINVAL; 1316 1317 if ((vmx->nested.msrs.pinbased_ctls_high & 1318 PIN_BASED_VMX_PREEMPTION_TIMER) && 1319 vmx_misc_preemption_timer_rate(data) != 1320 vmx_misc_preemption_timer_rate(vmx_misc)) 1321 return -EINVAL; 1322 1323 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1324 return -EINVAL; 1325 1326 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1327 return -EINVAL; 1328 1329 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1330 return -EINVAL; 1331 1332 vmx->nested.msrs.misc_low = data; 1333 vmx->nested.msrs.misc_high = data >> 32; 1334 1335 return 0; 1336 } 1337 1338 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1339 { 1340 u64 vmx_ept_vpid_cap = 
vmx_control_msr(vmcs_config.nested.ept_caps, 1341 vmcs_config.nested.vpid_caps); 1342 1343 /* Every bit is either reserved or a feature bit. */ 1344 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 1345 return -EINVAL; 1346 1347 vmx->nested.msrs.ept_caps = data; 1348 vmx->nested.msrs.vpid_caps = data >> 32; 1349 return 0; 1350 } 1351 1352 static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) 1353 { 1354 switch (msr_index) { 1355 case MSR_IA32_VMX_CR0_FIXED0: 1356 return &msrs->cr0_fixed0; 1357 case MSR_IA32_VMX_CR4_FIXED0: 1358 return &msrs->cr4_fixed0; 1359 default: 1360 BUG(); 1361 } 1362 } 1363 1364 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1365 { 1366 const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); 1367 1368 /* 1369 * 1 bits (which indicates bits which "must-be-1" during VMX operation) 1370 * must be 1 in the restored value. 1371 */ 1372 if (!is_bitwise_subset(data, *msr, -1ULL)) 1373 return -EINVAL; 1374 1375 *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; 1376 return 0; 1377 } 1378 1379 /* 1380 * Called when userspace is restoring VMX MSRs. 1381 * 1382 * Returns 0 on success, non-0 otherwise. 1383 */ 1384 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1385 { 1386 struct vcpu_vmx *vmx = to_vmx(vcpu); 1387 1388 /* 1389 * Don't allow changes to the VMX capability MSRs while the vCPU 1390 * is in VMX operation. 1391 */ 1392 if (vmx->nested.vmxon) 1393 return -EBUSY; 1394 1395 switch (msr_index) { 1396 case MSR_IA32_VMX_BASIC: 1397 return vmx_restore_vmx_basic(vmx, data); 1398 case MSR_IA32_VMX_PINBASED_CTLS: 1399 case MSR_IA32_VMX_PROCBASED_CTLS: 1400 case MSR_IA32_VMX_EXIT_CTLS: 1401 case MSR_IA32_VMX_ENTRY_CTLS: 1402 /* 1403 * The "non-true" VMX capability MSRs are generated from the 1404 * "true" MSRs, so we do not support restoring them directly. 
1405 * 1406 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1407 * should restore the "true" MSRs with the must-be-1 bits 1408 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND 1409 * DEFAULT SETTINGS". 1410 */ 1411 return -EINVAL; 1412 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1413 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1414 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1415 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1416 case MSR_IA32_VMX_PROCBASED_CTLS2: 1417 return vmx_restore_control_msr(vmx, msr_index, data); 1418 case MSR_IA32_VMX_MISC: 1419 return vmx_restore_vmx_misc(vmx, data); 1420 case MSR_IA32_VMX_CR0_FIXED0: 1421 case MSR_IA32_VMX_CR4_FIXED0: 1422 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1423 case MSR_IA32_VMX_CR0_FIXED1: 1424 case MSR_IA32_VMX_CR4_FIXED1: 1425 /* 1426 * These MSRs are generated based on the vCPU's CPUID, so we 1427 * do not support restoring them directly. 1428 */ 1429 return -EINVAL; 1430 case MSR_IA32_VMX_EPT_VPID_CAP: 1431 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 1432 case MSR_IA32_VMX_VMCS_ENUM: 1433 vmx->nested.msrs.vmcs_enum = data; 1434 return 0; 1435 case MSR_IA32_VMX_VMFUNC: 1436 if (data & ~vmcs_config.nested.vmfunc_controls) 1437 return -EINVAL; 1438 vmx->nested.msrs.vmfunc_controls = data; 1439 return 0; 1440 default: 1441 /* 1442 * The rest of the VMX capability MSRs do not support restore. 1443 */ 1444 return -EINVAL; 1445 } 1446 } 1447 1448 /* Returns 0 on success, non-0 otherwise. 
*/ 1449 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1450 { 1451 switch (msr_index) { 1452 case MSR_IA32_VMX_BASIC: 1453 *pdata = msrs->basic; 1454 break; 1455 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1456 case MSR_IA32_VMX_PINBASED_CTLS: 1457 *pdata = vmx_control_msr( 1458 msrs->pinbased_ctls_low, 1459 msrs->pinbased_ctls_high); 1460 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1461 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1462 break; 1463 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1464 case MSR_IA32_VMX_PROCBASED_CTLS: 1465 *pdata = vmx_control_msr( 1466 msrs->procbased_ctls_low, 1467 msrs->procbased_ctls_high); 1468 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1469 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1470 break; 1471 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1472 case MSR_IA32_VMX_EXIT_CTLS: 1473 *pdata = vmx_control_msr( 1474 msrs->exit_ctls_low, 1475 msrs->exit_ctls_high); 1476 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1477 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1478 break; 1479 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1480 case MSR_IA32_VMX_ENTRY_CTLS: 1481 *pdata = vmx_control_msr( 1482 msrs->entry_ctls_low, 1483 msrs->entry_ctls_high); 1484 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1485 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1486 break; 1487 case MSR_IA32_VMX_MISC: 1488 *pdata = vmx_control_msr( 1489 msrs->misc_low, 1490 msrs->misc_high); 1491 break; 1492 case MSR_IA32_VMX_CR0_FIXED0: 1493 *pdata = msrs->cr0_fixed0; 1494 break; 1495 case MSR_IA32_VMX_CR0_FIXED1: 1496 *pdata = msrs->cr0_fixed1; 1497 break; 1498 case MSR_IA32_VMX_CR4_FIXED0: 1499 *pdata = msrs->cr4_fixed0; 1500 break; 1501 case MSR_IA32_VMX_CR4_FIXED1: 1502 *pdata = msrs->cr4_fixed1; 1503 break; 1504 case MSR_IA32_VMX_VMCS_ENUM: 1505 *pdata = msrs->vmcs_enum; 1506 break; 1507 case MSR_IA32_VMX_PROCBASED_CTLS2: 1508 *pdata = vmx_control_msr( 1509 msrs->secondary_ctls_low, 1510 msrs->secondary_ctls_high); 1511 break; 1512 case MSR_IA32_VMX_EPT_VPID_CAP: 
1513 *pdata = msrs->ept_caps | 1514 ((u64)msrs->vpid_caps << 32); 1515 break; 1516 case MSR_IA32_VMX_VMFUNC: 1517 *pdata = msrs->vmfunc_controls; 1518 break; 1519 default: 1520 return 1; 1521 } 1522 1523 return 0; 1524 } 1525 1526 /* 1527 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have 1528 * been modified by the L1 guest. Note, "writable" in this context means 1529 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1530 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1531 * VM-exit information fields (which are actually writable if the vCPU is 1532 * configured to support "VMWRITE to any supported field in the VMCS"). 1533 */ 1534 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1535 { 1536 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1537 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1538 struct shadow_vmcs_field field; 1539 unsigned long val; 1540 int i; 1541 1542 if (WARN_ON(!shadow_vmcs)) 1543 return; 1544 1545 preempt_disable(); 1546 1547 vmcs_load(shadow_vmcs); 1548 1549 for (i = 0; i < max_shadow_read_write_fields; i++) { 1550 field = shadow_read_write_fields[i]; 1551 val = __vmcs_readl(field.encoding); 1552 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1553 } 1554 1555 vmcs_clear(shadow_vmcs); 1556 vmcs_load(vmx->loaded_vmcs->vmcs); 1557 1558 preempt_enable(); 1559 } 1560 1561 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1562 { 1563 const struct shadow_vmcs_field *fields[] = { 1564 shadow_read_write_fields, 1565 shadow_read_only_fields 1566 }; 1567 const int max_fields[] = { 1568 max_shadow_read_write_fields, 1569 max_shadow_read_only_fields 1570 }; 1571 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1572 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1573 struct shadow_vmcs_field field; 1574 unsigned long val; 1575 int i, q; 1576 1577 if (WARN_ON(!shadow_vmcs)) 1578 return; 1579 1580 vmcs_load(shadow_vmcs); 1581 1582 for (q = 0; q < 
ARRAY_SIZE(fields); q++) { 1583 for (i = 0; i < max_fields[q]; i++) { 1584 field = fields[q][i]; 1585 val = vmcs12_read_any(vmcs12, field.encoding, 1586 field.offset); 1587 __vmcs_writel(field.encoding, val); 1588 } 1589 } 1590 1591 vmcs_clear(shadow_vmcs); 1592 vmcs_load(vmx->loaded_vmcs->vmcs); 1593 } 1594 1595 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) 1596 { 1597 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1598 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1599 1600 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1601 vmcs12->tpr_threshold = evmcs->tpr_threshold; 1602 vmcs12->guest_rip = evmcs->guest_rip; 1603 1604 if (unlikely(!(hv_clean_fields & 1605 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1606 vmcs12->guest_rsp = evmcs->guest_rsp; 1607 vmcs12->guest_rflags = evmcs->guest_rflags; 1608 vmcs12->guest_interruptibility_info = 1609 evmcs->guest_interruptibility_info; 1610 } 1611 1612 if (unlikely(!(hv_clean_fields & 1613 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1614 vmcs12->cpu_based_vm_exec_control = 1615 evmcs->cpu_based_vm_exec_control; 1616 } 1617 1618 if (unlikely(!(hv_clean_fields & 1619 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1620 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1621 } 1622 1623 if (unlikely(!(hv_clean_fields & 1624 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1625 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1626 } 1627 1628 if (unlikely(!(hv_clean_fields & 1629 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1630 vmcs12->vm_entry_intr_info_field = 1631 evmcs->vm_entry_intr_info_field; 1632 vmcs12->vm_entry_exception_error_code = 1633 evmcs->vm_entry_exception_error_code; 1634 vmcs12->vm_entry_instruction_len = 1635 evmcs->vm_entry_instruction_len; 1636 } 1637 1638 if (unlikely(!(hv_clean_fields & 1639 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { 1640 vmcs12->host_ia32_pat = evmcs->host_ia32_pat; 1641 vmcs12->host_ia32_efer = 
evmcs->host_ia32_efer; 1642 vmcs12->host_cr0 = evmcs->host_cr0; 1643 vmcs12->host_cr3 = evmcs->host_cr3; 1644 vmcs12->host_cr4 = evmcs->host_cr4; 1645 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; 1646 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; 1647 vmcs12->host_rip = evmcs->host_rip; 1648 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; 1649 vmcs12->host_es_selector = evmcs->host_es_selector; 1650 vmcs12->host_cs_selector = evmcs->host_cs_selector; 1651 vmcs12->host_ss_selector = evmcs->host_ss_selector; 1652 vmcs12->host_ds_selector = evmcs->host_ds_selector; 1653 vmcs12->host_fs_selector = evmcs->host_fs_selector; 1654 vmcs12->host_gs_selector = evmcs->host_gs_selector; 1655 vmcs12->host_tr_selector = evmcs->host_tr_selector; 1656 } 1657 1658 if (unlikely(!(hv_clean_fields & 1659 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { 1660 vmcs12->pin_based_vm_exec_control = 1661 evmcs->pin_based_vm_exec_control; 1662 vmcs12->vm_exit_controls = evmcs->vm_exit_controls; 1663 vmcs12->secondary_vm_exec_control = 1664 evmcs->secondary_vm_exec_control; 1665 } 1666 1667 if (unlikely(!(hv_clean_fields & 1668 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { 1669 vmcs12->io_bitmap_a = evmcs->io_bitmap_a; 1670 vmcs12->io_bitmap_b = evmcs->io_bitmap_b; 1671 } 1672 1673 if (unlikely(!(hv_clean_fields & 1674 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { 1675 vmcs12->msr_bitmap = evmcs->msr_bitmap; 1676 } 1677 1678 if (unlikely(!(hv_clean_fields & 1679 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { 1680 vmcs12->guest_es_base = evmcs->guest_es_base; 1681 vmcs12->guest_cs_base = evmcs->guest_cs_base; 1682 vmcs12->guest_ss_base = evmcs->guest_ss_base; 1683 vmcs12->guest_ds_base = evmcs->guest_ds_base; 1684 vmcs12->guest_fs_base = evmcs->guest_fs_base; 1685 vmcs12->guest_gs_base = evmcs->guest_gs_base; 1686 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; 1687 vmcs12->guest_tr_base = evmcs->guest_tr_base; 1688 vmcs12->guest_gdtr_base = 
evmcs->guest_gdtr_base; 1689 vmcs12->guest_idtr_base = evmcs->guest_idtr_base; 1690 vmcs12->guest_es_limit = evmcs->guest_es_limit; 1691 vmcs12->guest_cs_limit = evmcs->guest_cs_limit; 1692 vmcs12->guest_ss_limit = evmcs->guest_ss_limit; 1693 vmcs12->guest_ds_limit = evmcs->guest_ds_limit; 1694 vmcs12->guest_fs_limit = evmcs->guest_fs_limit; 1695 vmcs12->guest_gs_limit = evmcs->guest_gs_limit; 1696 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; 1697 vmcs12->guest_tr_limit = evmcs->guest_tr_limit; 1698 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; 1699 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; 1700 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; 1701 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1702 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1703 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1704 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1705 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1706 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1707 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1708 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1709 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1710 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1711 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1712 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1713 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1714 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1715 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1716 } 1717 1718 if (unlikely(!(hv_clean_fields & 1719 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1720 vmcs12->tsc_offset = evmcs->tsc_offset; 1721 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1722 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1723 } 1724 1725 if (unlikely(!(hv_clean_fields & 1726 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1727 vmcs12->cr0_guest_host_mask = 
evmcs->cr0_guest_host_mask; 1728 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1729 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1730 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1731 vmcs12->guest_cr0 = evmcs->guest_cr0; 1732 vmcs12->guest_cr3 = evmcs->guest_cr3; 1733 vmcs12->guest_cr4 = evmcs->guest_cr4; 1734 vmcs12->guest_dr7 = evmcs->guest_dr7; 1735 } 1736 1737 if (unlikely(!(hv_clean_fields & 1738 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1739 vmcs12->host_fs_base = evmcs->host_fs_base; 1740 vmcs12->host_gs_base = evmcs->host_gs_base; 1741 vmcs12->host_tr_base = evmcs->host_tr_base; 1742 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1743 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1744 vmcs12->host_rsp = evmcs->host_rsp; 1745 } 1746 1747 if (unlikely(!(hv_clean_fields & 1748 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1749 vmcs12->ept_pointer = evmcs->ept_pointer; 1750 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1751 } 1752 1753 if (unlikely(!(hv_clean_fields & 1754 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1755 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1756 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1757 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1758 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1759 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1760 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1761 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1762 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1763 vmcs12->guest_pending_dbg_exceptions = 1764 evmcs->guest_pending_dbg_exceptions; 1765 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1766 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1767 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1768 vmcs12->guest_activity_state = evmcs->guest_activity_state; 1769 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1770 } 1771 1772 /* 1773 * Not used? 
1774 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1775 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1776 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1777 * vmcs12->page_fault_error_code_mask = 1778 * evmcs->page_fault_error_code_mask; 1779 * vmcs12->page_fault_error_code_match = 1780 * evmcs->page_fault_error_code_match; 1781 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1782 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1783 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1784 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1785 */ 1786 1787 /* 1788 * Read only fields: 1789 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1790 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1791 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1792 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1793 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1794 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1795 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1796 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1797 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1798 * vmcs12->exit_qualification = evmcs->exit_qualification; 1799 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1800 * 1801 * Not present in struct vmcs12: 1802 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1803 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1804 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1805 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1806 */ 1807 1808 return; 1809 } 1810 1811 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1812 { 1813 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1814 struct hv_enlightened_vmcs *evmcs = 
vmx->nested.hv_evmcs; 1815 1816 /* 1817 * Should not be changed by KVM: 1818 * 1819 * evmcs->host_es_selector = vmcs12->host_es_selector; 1820 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1821 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1822 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1823 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1824 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1825 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1826 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1827 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1828 * evmcs->host_cr0 = vmcs12->host_cr0; 1829 * evmcs->host_cr3 = vmcs12->host_cr3; 1830 * evmcs->host_cr4 = vmcs12->host_cr4; 1831 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1832 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1833 * evmcs->host_rip = vmcs12->host_rip; 1834 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1835 * evmcs->host_fs_base = vmcs12->host_fs_base; 1836 * evmcs->host_gs_base = vmcs12->host_gs_base; 1837 * evmcs->host_tr_base = vmcs12->host_tr_base; 1838 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1839 * evmcs->host_idtr_base = vmcs12->host_idtr_base; 1840 * evmcs->host_rsp = vmcs12->host_rsp; 1841 * sync_vmcs02_to_vmcs12() doesn't read these: 1842 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1843 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1844 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1845 * evmcs->ept_pointer = vmcs12->ept_pointer; 1846 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1847 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1848 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1849 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1850 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1851 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1852 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1853 * 
evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1854 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1855 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1856 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1857 * evmcs->page_fault_error_code_mask = 1858 * vmcs12->page_fault_error_code_mask; 1859 * evmcs->page_fault_error_code_match = 1860 * vmcs12->page_fault_error_code_match; 1861 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1862 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1863 * evmcs->tsc_offset = vmcs12->tsc_offset; 1864 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1865 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1866 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1867 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1868 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1869 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1870 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1871 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1872 * 1873 * Not present in struct vmcs12: 1874 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1875 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1876 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1877 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1878 */ 1879 1880 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1881 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1882 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1883 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1884 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1885 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1886 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1887 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1888 1889 
evmcs->guest_es_limit = vmcs12->guest_es_limit; 1890 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1891 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1892 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1893 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1894 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1895 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1896 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1897 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1898 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1899 1900 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1901 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1902 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1903 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1904 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1905 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1906 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1907 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1908 1909 evmcs->guest_es_base = vmcs12->guest_es_base; 1910 evmcs->guest_cs_base = vmcs12->guest_cs_base; 1911 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1912 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1913 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1914 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1915 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1916 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1917 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1918 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1919 1920 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1921 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1922 1923 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1924 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1925 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1926 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1927 1928 evmcs->guest_pending_dbg_exceptions = 1929 vmcs12->guest_pending_dbg_exceptions; 1930 evmcs->guest_sysenter_esp = 
vmcs12->guest_sysenter_esp; 1931 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1932 1933 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1934 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1935 1936 evmcs->guest_cr0 = vmcs12->guest_cr0; 1937 evmcs->guest_cr3 = vmcs12->guest_cr3; 1938 evmcs->guest_cr4 = vmcs12->guest_cr4; 1939 evmcs->guest_dr7 = vmcs12->guest_dr7; 1940 1941 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1942 1943 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1944 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1945 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1946 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1947 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1948 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1949 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1950 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1951 1952 evmcs->exit_qualification = vmcs12->exit_qualification; 1953 1954 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1955 evmcs->guest_rsp = vmcs12->guest_rsp; 1956 evmcs->guest_rflags = vmcs12->guest_rflags; 1957 1958 evmcs->guest_interruptibility_info = 1959 vmcs12->guest_interruptibility_info; 1960 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1961 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1962 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1963 evmcs->vm_entry_exception_error_code = 1964 vmcs12->vm_entry_exception_error_code; 1965 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 1966 1967 evmcs->guest_rip = vmcs12->guest_rip; 1968 1969 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 1970 1971 return; 1972 } 1973 1974 /* 1975 * This is an equivalent of the nested hypervisor executing the vmptrld 1976 * instruction. 
1977 */ 1978 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( 1979 struct kvm_vcpu *vcpu, bool from_launch) 1980 { 1981 struct vcpu_vmx *vmx = to_vmx(vcpu); 1982 bool evmcs_gpa_changed = false; 1983 u64 evmcs_gpa; 1984 1985 if (likely(!vmx->nested.enlightened_vmcs_enabled)) 1986 return EVMPTRLD_DISABLED; 1987 1988 if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) { 1989 nested_release_evmcs(vcpu); 1990 return EVMPTRLD_DISABLED; 1991 } 1992 1993 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { 1994 vmx->nested.current_vmptr = INVALID_GPA; 1995 1996 nested_release_evmcs(vcpu); 1997 1998 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), 1999 &vmx->nested.hv_evmcs_map)) 2000 return EVMPTRLD_ERROR; 2001 2002 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; 2003 2004 /* 2005 * Currently, KVM only supports eVMCS version 1 2006 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this 2007 * value to first u32 field of eVMCS which should specify eVMCS 2008 * VersionNumber. 2009 * 2010 * Guest should be aware of supported eVMCS versions by host by 2011 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is 2012 * expected to set this CPUID leaf according to the value 2013 * returned in vmcs_version from nested_enable_evmcs(). 2014 * 2015 * However, it turns out that Microsoft Hyper-V fails to comply 2016 * to their own invented interface: When Hyper-V use eVMCS, it 2017 * just sets first u32 field of eVMCS to revision_id specified 2018 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number 2019 * which is one of the supported versions specified in 2020 * CPUID.0x4000000A.EAX[0:15]. 2021 * 2022 * To overcome Hyper-V bug, we accept here either a supported 2023 * eVMCS version or VMCS12 revision_id as valid values for first 2024 * u32 field of eVMCS. 
2025 */ 2026 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && 2027 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { 2028 nested_release_evmcs(vcpu); 2029 return EVMPTRLD_VMFAIL; 2030 } 2031 2032 vmx->nested.hv_evmcs_vmptr = evmcs_gpa; 2033 2034 evmcs_gpa_changed = true; 2035 /* 2036 * Unlike normal vmcs12, enlightened vmcs12 is not fully 2037 * reloaded from guest's memory (read only fields, fields not 2038 * present in struct hv_enlightened_vmcs, ...). Make sure there 2039 * are no leftovers. 2040 */ 2041 if (from_launch) { 2042 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2043 memset(vmcs12, 0, sizeof(*vmcs12)); 2044 vmcs12->hdr.revision_id = VMCS12_REVISION; 2045 } 2046 2047 } 2048 2049 /* 2050 * Clean fields data can't be used on VMLAUNCH and when we switch 2051 * between different L2 guests as KVM keeps a single VMCS12 per L1. 2052 */ 2053 if (from_launch || evmcs_gpa_changed) { 2054 vmx->nested.hv_evmcs->hv_clean_fields &= 2055 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2056 2057 vmx->nested.force_msr_bitmap_recalc = true; 2058 } 2059 2060 return EVMPTRLD_SUCCEEDED; 2061 } 2062 2063 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2064 { 2065 struct vcpu_vmx *vmx = to_vmx(vcpu); 2066 2067 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2068 copy_vmcs12_to_enlightened(vmx); 2069 else 2070 copy_vmcs12_to_shadow(vmx); 2071 2072 vmx->nested.need_vmcs12_to_shadow_sync = false; 2073 } 2074 2075 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2076 { 2077 struct vcpu_vmx *vmx = 2078 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2079 2080 vmx->nested.preemption_timer_expired = true; 2081 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2082 kvm_vcpu_kick(&vmx->vcpu); 2083 2084 return HRTIMER_NORESTART; 2085 } 2086 2087 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2088 { 2089 struct vcpu_vmx *vmx = to_vmx(vcpu); 2090 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2091 2092 u64 l1_scaled_tsc 
= kvm_read_l1_tsc(vcpu, rdtsc()) >> 2093 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2094 2095 if (!vmx->nested.has_preemption_timer_deadline) { 2096 vmx->nested.preemption_timer_deadline = 2097 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2098 vmx->nested.has_preemption_timer_deadline = true; 2099 } 2100 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2101 } 2102 2103 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2104 u64 preemption_timeout) 2105 { 2106 struct vcpu_vmx *vmx = to_vmx(vcpu); 2107 2108 /* 2109 * A timer value of zero is architecturally guaranteed to cause 2110 * a VMExit prior to executing any instructions in the guest. 2111 */ 2112 if (preemption_timeout == 0) { 2113 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2114 return; 2115 } 2116 2117 if (vcpu->arch.virtual_tsc_khz == 0) 2118 return; 2119 2120 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2121 preemption_timeout *= 1000000; 2122 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2123 hrtimer_start(&vmx->nested.preemption_timer, 2124 ktime_add_ns(ktime_get(), preemption_timeout), 2125 HRTIMER_MODE_ABS_PINNED); 2126 } 2127 2128 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2129 { 2130 if (vmx->nested.nested_run_pending && 2131 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2132 return vmcs12->guest_ia32_efer; 2133 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2134 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2135 else 2136 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2137 } 2138 2139 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2140 { 2141 struct kvm *kvm = vmx->vcpu.kvm; 2142 2143 /* 2144 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2145 * according to L0's settings (vmcs12 is irrelevant here). Host 2146 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2147 * will be set as needed prior to VMLAUNCH/VMRESUME. 
*/
	if (vmx->nested.vmcs02_initialized)
		return;
	/* Set the flag first; everything below runs once per vmcs02. */
	vmx->nested.vmcs02_initialized = true;

	/*
	 * We don't care what the EPTP value is; we just need to guarantee
	 * it's valid so we don't get a false positive when doing early
	 * consistency checks.
	 */
	if (enable_ept && nested_early_check)
		vmcs_write64(EPT_POINTER,
			     construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));

	/* All VMFUNCs are currently emulated through L0 vmexits.  */
	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

	if (cpu_has_vmx_posted_intr())
		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);

	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));

	/*
	 * PML is emulated for L2, but never enabled in hardware as the MMU
	 * handles A/D emulation.  Disabling PML for L2 also avoids having to
	 * deal with filtering out L2 GPAs from the buffer.
	 */
	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, 0);
		vmcs_write16(GUEST_PML_INDEX, -1);
	}

	if (cpu_has_vmx_encls_vmexit())
		vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);

	if (kvm_notify_vmexit_enabled(kvm))
		vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);

	/*
	 * Set the MSR load/store lists to match L0's settings.  Only the
	 * addresses are constant (for vmcs02), the counts can change based
	 * on L2's behavior, e.g. switching to/from long mode.
	 */
	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

	vmx_set_constant_host_state(vmx);
}

/*
 * vmcs02 setup that prepare_vmcs02_early() performs only when vmcs12 is
 * dirty or an enlightened VMCS is in use: the one-time constant state, an
 * invalid VMCS link pointer and, with VPID enabled, the VPID to run L2
 * with (vpid02 if L1 uses VPID and one was allocated, L1's own otherwise).
 */
static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
				      struct vmcs12 *vmcs12)
{
	prepare_vmcs02_constant_state(vmx);

	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);

	if (enable_vpid) {
		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
		else
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
	}
}

/*
 * Build the vmcs02 control fields by merging vmcs01's (L0's) controls with
 * the controls L1 put in vmcs12, then set up the VM-entry event injection
 * fields.
 *
 * @vmx:    the vCPU's VMX state
 * @vmcs01: the loaded VMCS whose controls are used as the L0 baseline
 * @vmcs12: L1's VMCS for L2
 */
static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
				 struct vmcs12 *vmcs12)
{
	u32 exec_control;
	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);

	if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		prepare_vmcs02_early_rare(vmx, vmcs12);

	/*
	 * PIN CONTROLS
	 *
	 * vmcs12's preemption-timer bit is masked out of the merge below.
	 */
	exec_control = __pin_controls_get(vmcs01);
	exec_control |= (vmcs12->pin_based_vm_exec_control &
			 ~PIN_BASED_VMX_PREEMPTION_TIMER);

	/* Posted interrupts setting is only taken from vmcs12.
	 */
	vmx->nested.pi_pending = false;
	if (nested_cpu_has_posted_intr(vmcs12))
		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
	else
		exec_control &= ~PIN_BASED_POSTED_INTR;
	pin_controls_set(vmx, exec_control);

	/*
	 * EXEC CONTROLS
	 *
	 * Interrupt-window, NMI-window and TPR-shadow are dropped from L0's
	 * value before vmcs12's controls are OR-ed in, so those three bits
	 * end up reflecting vmcs12 only.
	 */
	exec_control = __exec_controls_get(vmcs01); /* L0's desires */
	exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_TPR_SHADOW;
	exec_control |= vmcs12->cpu_based_vm_exec_control;

	vmx->nested.l1_tpr_threshold = -1;
	if (exec_control & CPU_BASED_TPR_SHADOW)
		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
#ifdef CONFIG_X86_64
	else
		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
				CPU_BASED_CR8_STORE_EXITING;
#endif

	/*
	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
	 * for I/O port accesses.
	 */
	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;

	/*
	 * This bit will be computed in nested_get_vmcs12_pages, because
	 * we do not have access to L1's MSR bitmap yet.  For now, keep
	 * the same bit as before, hoping to avoid multiple VMWRITEs that
	 * only set/clear this bit.
	 */
	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
	exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;

	exec_controls_set(vmx, exec_control);

	/*
	 * SECONDARY EXEC CONTROLS
	 */
	if (cpu_has_secondary_exec_ctrls()) {
		exec_control = __secondary_exec_controls_get(vmcs01);

		/* Take the following fields only from vmcs12 */
		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
				  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
				  SECONDARY_EXEC_ENABLE_INVPCID |
				  SECONDARY_EXEC_ENABLE_RDTSCP |
				  SECONDARY_EXEC_XSAVES |
				  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_ENABLE_VMFUNC |
				  SECONDARY_EXEC_DESC);

		/* vmcs12's secondary controls count only if L1 activated them. */
		if (nested_cpu_has(vmcs12,
				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
			exec_control |= vmcs12->secondary_vm_exec_control;

		/* PML is emulated and never enabled in hardware for L2. */
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

		/* VMCS shadowing for L2 is emulated for now */
		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

		/*
		 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
		 * will not have to rewrite the controls just for this bit.
		 */
		if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
		    (vmcs12->guest_cr4 & X86_CR4_UMIP))
			exec_control |= SECONDARY_EXEC_DESC;

		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
			vmcs_write16(GUEST_INTR_STATUS,
				vmcs12->guest_intr_status);

		/* Unrestricted guest is kept only if L1 enabled it as well. */
		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
			exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;

		if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
			vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);

		secondary_exec_controls_set(vmx, exec_control);
	}

	/*
	 * ENTRY CONTROLS
	 *
	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
	 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
	 * on the related bits (if supported by the CPU) in the hope that
	 * we can avoid VMWrites during vmx_set_efer().
	 */
	exec_control = __vm_entry_controls_get(vmcs01);
	exec_control |= vmcs12->vm_entry_controls;
	exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
	if (cpu_has_load_ia32_efer()) {
		if (guest_efer & EFER_LMA)
			exec_control |= VM_ENTRY_IA32E_MODE;
		if (guest_efer != host_efer)
			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
	}
	vm_entry_controls_set(vmx, exec_control);

	/*
	 * EXIT CONTROLS
	 *
	 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
	 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
*/
	exec_control = __vm_exit_controls_get(vmcs01);
	if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
		exec_control |= VM_EXIT_LOAD_IA32_EFER;
	else
		exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
	vm_exit_controls_set(vmx, exec_control);

	/*
	 * Interrupt/Exception Fields
	 *
	 * L1's injection request and interruptibility state are written
	 * only while a nested run is pending; otherwise the injection
	 * field is cleared.
	 */
	if (vmx->nested.nested_run_pending) {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			     vmcs12->vm_entry_intr_info_field);
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
			     vmcs12->vm_entry_exception_error_code);
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmcs12->vm_entry_instruction_len);
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
			     vmcs12->guest_interruptibility_info);
		vmx->loaded_vmcs->nmi_known_unmasked =
			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
	} else {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
	}
}

/*
 * Write the less frequently changing vmcs12 state into vmcs02: guest
 * segment and descriptor-table state, SYSENTER/debug/PDPTR/BNDCFGS state,
 * the XSS and EOI exit bitmaps, the page-fault error-code mask/match pair
 * and the MSR load/store counts.
 *
 * With an enlightened VMCS (vmx->nested.hv_evmcs != NULL) the two guest
 * state groups are skipped while their GUEST_GRP2/GUEST_GRP1 clean bits
 * are set.
 */
static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;

	/* Guest group 2: segment selectors, limits, access rights, bases. */
	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
		vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
		vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
		vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
		vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
		vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
		vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
		vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
		vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
		vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
		vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
		vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
		vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
		vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
		vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
		vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
		vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
		vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
		vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
		vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
		vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
		vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
		vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
		vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
		vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
		vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
		vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
		vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
		vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
		vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
		vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
		vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);

		/* The segment fields were rewritten; reset the segment cache. */
		vmx->segment_cache.bitmask = 0;
	}

	/* Guest group 1: SYSENTER MSRs, pending debug exceptions, PDPTRs, BNDCFGS. */
	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
			    vmcs12->guest_pending_dbg_exceptions);
		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

		/*
		 * L1 may access L2's PDPTRs, so save them so that vmcs12
		 * can be constructed later.
		 */
		if (enable_ept) {
			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
		}

		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
	}

	if (nested_cpu_has_xsaves(vmcs12))
		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);

	/*
	 * Whether page-faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.  If L0
	 * doesn't care about page faults then we should set all of these to
	 * L1's desires. However, if L0 does care about (some) page faults, it
	 * is not easy (if at all possible?) to merge L0 and L1's desires, so
	 * we simply ask to exit on each and every L2 page fault. This is done
	 * by setting MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 */
	if (vmx_need_pf_intercept(&vmx->vcpu)) {
		/*
		 * TODO: if both L0 and L1 need the same MASK and MATCH,
		 * go ahead and use it?
		 */
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	} else {
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
	}

	if (cpu_has_vmx_apicv()) {
		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
	}

	/*
	 * Make sure the msr_autostore list is up to date before we set the
	 * count in the vmcs02.
	 */
	prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);

	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	set_cr4_guest_host_mask(vmx);
}

/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 * guest in a way that will both be appropriate to L1's requests, and our
 * needs. In addition to modifying the active vmcs (which is vmcs02), this
 * function also has additional necessary side-effects, like setting various
 * vcpu->arch fields.
 * Returns 0 on success, -EINVAL on failure. Invalid state exit qualification
 * code is assigned to entry_failure_code on failure.
*/
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry,
			  enum vm_entry_failure_code *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool load_guest_pdptrs_vmcs12 = false;

	if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
		prepare_vmcs02_rare(vmx, vmcs12);
		vmx->nested.dirty_vmcs12 = false;

		/*
		 * The PDPTRs are reloaded later unless an eVMCS is in use
		 * and its GUEST_GRP1 clean bit is set.
		 */
		load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
			!(vmx->nested.hv_evmcs->hv_clean_fields &
			  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
	}

	/*
	 * DR7 and DEBUGCTL come from vmcs12 only when a nested run is
	 * pending and L1 asked for debug controls to be loaded; otherwise
	 * the current DR7 and the saved pre-VM-entry DEBUGCTL are used.
	 */
	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
	}
	/* Complements the BNDCFGS load in prepare_vmcs02_rare(). */
	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);

	/*
	 * EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	vmx_update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
	}

	/* Combine L1's TSC offset/scaling with the values L1 set for L2. */
	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
			vcpu->arch.l1_tsc_offset,
			vmx_get_l2_tsc_offset(vcpu),
			vmx_get_l2_tsc_multiplier(vcpu));

	vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
			vcpu->arch.l1_tsc_scaling_ratio,
			vmx_get_l2_tsc_multiplier(vcpu));

	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (kvm_caps.has_tsc_control)
		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);

	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);

	/*
	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
	 * bits which we consider mandatory enabled.
	 * The CR0_READ_SHADOW is what L2 should have expected to read given
	 * the specifications by L1; it's not enough to take
	 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask has more
	 * bits set than L1 expected.
	 */
	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
	vmx_set_efer(vcpu, vcpu->arch.efer);

	/*
	 * Guest state is invalid and unrestricted guest is disabled,
	 * which means L1 attempted VMEntry to L2 with invalid state.
	 * Fail the VMEntry.
	 *
	 * However, when force loading the guest state (SMM exit or
	 * loading nested state after migration), it is possible to
	 * have invalid guest state now, which will be fixed later by
	 * restoring L2 register state.
	 */
	if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/* Shadow page tables on either EPT or shadow page tables. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				from_vmentry, entry_failure_code))
		return -EINVAL;

	/*
	 * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
	 * on nested VM-Exit, which can occur without actually running L2 and
	 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
	 * transition to HLT instead of running L2.
	 */
	if (enable_ept)
		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);

	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set.
*/ 2617 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2618 is_pae_paging(vcpu)) { 2619 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2620 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2621 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2622 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2623 } 2624 2625 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2626 intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && 2627 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2628 vmcs12->guest_ia32_perf_global_ctrl))) { 2629 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2630 return -EINVAL; 2631 } 2632 2633 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2634 kvm_rip_write(vcpu, vmcs12->guest_rip); 2635 2636 /* 2637 * It was observed that genuine Hyper-V running in L1 doesn't reset 2638 * 'hv_clean_fields' by itself, it only sets the corresponding dirty 2639 * bits when it changes a field in eVMCS. Mark all fields as clean 2640 * here. 2641 */ 2642 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2643 vmx->nested.hv_evmcs->hv_clean_fields |= 2644 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2645 2646 return 0; 2647 } 2648 2649 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2650 { 2651 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2652 nested_cpu_has_virtual_nmis(vmcs12))) 2653 return -EINVAL; 2654 2655 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2656 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2657 return -EINVAL; 2658 2659 return 0; 2660 } 2661 2662 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2663 { 2664 struct vcpu_vmx *vmx = to_vmx(vcpu); 2665 2666 /* Check for memory type validity */ 2667 switch (new_eptp & VMX_EPTP_MT_MASK) { 2668 case VMX_EPTP_MT_UC: 2669 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2670 return false; 2671 break; 2672 case VMX_EPTP_MT_WB: 2673 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2674 return false; 2675 break; 2676 default: 2677 
return false; 2678 } 2679 2680 /* Page-walk levels validity. */ 2681 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2682 case VMX_EPTP_PWL_5: 2683 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2684 return false; 2685 break; 2686 case VMX_EPTP_PWL_4: 2687 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2688 return false; 2689 break; 2690 default: 2691 return false; 2692 } 2693 2694 /* Reserved bits should not be set */ 2695 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2696 return false; 2697 2698 /* AD, if set, should be supported */ 2699 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2700 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2701 return false; 2702 } 2703 2704 return true; 2705 } 2706 2707 /* 2708 * Checks related to VM-Execution Control Fields 2709 */ 2710 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2711 struct vmcs12 *vmcs12) 2712 { 2713 struct vcpu_vmx *vmx = to_vmx(vcpu); 2714 2715 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2716 vmx->nested.msrs.pinbased_ctls_low, 2717 vmx->nested.msrs.pinbased_ctls_high)) || 2718 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2719 vmx->nested.msrs.procbased_ctls_low, 2720 vmx->nested.msrs.procbased_ctls_high))) 2721 return -EINVAL; 2722 2723 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2724 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2725 vmx->nested.msrs.secondary_ctls_low, 2726 vmx->nested.msrs.secondary_ctls_high))) 2727 return -EINVAL; 2728 2729 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2730 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2731 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2732 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2733 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2734 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2735 nested_vmx_check_nmi_controls(vmcs12) || 2736 
nested_vmx_check_pml_controls(vcpu, vmcs12) || 2737 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2738 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2739 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2740 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2741 return -EINVAL; 2742 2743 if (!nested_cpu_has_preemption_timer(vmcs12) && 2744 nested_cpu_has_save_preemption_timer(vmcs12)) 2745 return -EINVAL; 2746 2747 if (nested_cpu_has_ept(vmcs12) && 2748 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2749 return -EINVAL; 2750 2751 if (nested_cpu_has_vmfunc(vmcs12)) { 2752 if (CC(vmcs12->vm_function_control & 2753 ~vmx->nested.msrs.vmfunc_controls)) 2754 return -EINVAL; 2755 2756 if (nested_cpu_has_eptp_switching(vmcs12)) { 2757 if (CC(!nested_cpu_has_ept(vmcs12)) || 2758 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2759 return -EINVAL; 2760 } 2761 } 2762 2763 return 0; 2764 } 2765 2766 /* 2767 * Checks related to VM-Exit Control Fields 2768 */ 2769 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2770 struct vmcs12 *vmcs12) 2771 { 2772 struct vcpu_vmx *vmx = to_vmx(vcpu); 2773 2774 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2775 vmx->nested.msrs.exit_ctls_low, 2776 vmx->nested.msrs.exit_ctls_high)) || 2777 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2778 return -EINVAL; 2779 2780 return 0; 2781 } 2782 2783 /* 2784 * Checks related to VM-Entry Control Fields 2785 */ 2786 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2787 struct vmcs12 *vmcs12) 2788 { 2789 struct vcpu_vmx *vmx = to_vmx(vcpu); 2790 2791 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2792 vmx->nested.msrs.entry_ctls_low, 2793 vmx->nested.msrs.entry_ctls_high))) 2794 return -EINVAL; 2795 2796 /* 2797 * From the Intel SDM, volume 3: 2798 * Fields relevant to VM-entry event injection must be set properly. 
2799 * These fields are the VM-entry interruption-information field, the 2800 * VM-entry exception error code, and the VM-entry instruction length. 2801 */ 2802 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2803 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2804 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2805 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2806 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2807 bool should_have_error_code; 2808 bool urg = nested_cpu_has2(vmcs12, 2809 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2810 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2811 2812 /* VM-entry interruption-info field: interruption type */ 2813 if (CC(intr_type == INTR_TYPE_RESERVED) || 2814 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2815 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2816 return -EINVAL; 2817 2818 /* VM-entry interruption-info field: vector */ 2819 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2820 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2821 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2822 return -EINVAL; 2823 2824 /* VM-entry interruption-info field: deliver error code */ 2825 should_have_error_code = 2826 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2827 x86_exception_has_error_code(vector); 2828 if (CC(has_error_code != should_have_error_code)) 2829 return -EINVAL; 2830 2831 /* VM-entry exception error code */ 2832 if (CC(has_error_code && 2833 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2834 return -EINVAL; 2835 2836 /* VM-entry interruption-info field: reserved bits */ 2837 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2838 return -EINVAL; 2839 2840 /* VM-entry instruction length */ 2841 switch (intr_type) { 2842 case INTR_TYPE_SOFT_EXCEPTION: 2843 case INTR_TYPE_SOFT_INTR: 2844 case INTR_TYPE_PRIV_SW_EXCEPTION: 2845 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2846 CC(vmcs12->vm_entry_instruction_len == 0 && 
2847 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2848 return -EINVAL; 2849 } 2850 } 2851 2852 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2853 return -EINVAL; 2854 2855 return 0; 2856 } 2857 2858 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2859 struct vmcs12 *vmcs12) 2860 { 2861 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2862 nested_check_vm_exit_controls(vcpu, vmcs12) || 2863 nested_check_vm_entry_controls(vcpu, vmcs12)) 2864 return -EINVAL; 2865 2866 if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled) 2867 return nested_evmcs_check_controls(vmcs12); 2868 2869 return 0; 2870 } 2871 2872 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 2873 struct vmcs12 *vmcs12) 2874 { 2875 #ifdef CONFIG_X86_64 2876 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 2877 !!(vcpu->arch.efer & EFER_LMA))) 2878 return -EINVAL; 2879 #endif 2880 return 0; 2881 } 2882 2883 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2884 struct vmcs12 *vmcs12) 2885 { 2886 bool ia32e; 2887 2888 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2889 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2890 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) 2891 return -EINVAL; 2892 2893 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2894 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2895 return -EINVAL; 2896 2897 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2898 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2899 return -EINVAL; 2900 2901 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2902 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2903 vmcs12->host_ia32_perf_global_ctrl))) 2904 return -EINVAL; 2905 2906 #ifdef CONFIG_X86_64 2907 ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 2908 #else 2909 ia32e = false; 2910 #endif 2911 2912 if (ia32e) { 2913 if (CC(!(vmcs12->host_cr4 & 
X86_CR4_PAE))) 2914 return -EINVAL; 2915 } else { 2916 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2917 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2918 CC((vmcs12->host_rip) >> 32)) 2919 return -EINVAL; 2920 } 2921 2922 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2923 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2924 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2925 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2926 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2927 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2928 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2929 CC(vmcs12->host_cs_selector == 0) || 2930 CC(vmcs12->host_tr_selector == 0) || 2931 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2932 return -EINVAL; 2933 2934 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2935 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2936 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2937 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2938 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2939 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2940 return -EINVAL; 2941 2942 /* 2943 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2944 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2945 * the values of the LMA and LME bits in the field must each be that of 2946 * the host address-space size VM-exit control. 
2947 */ 2948 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2949 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2950 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2951 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2952 return -EINVAL; 2953 } 2954 2955 return 0; 2956 } 2957 2958 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2959 struct vmcs12 *vmcs12) 2960 { 2961 struct vcpu_vmx *vmx = to_vmx(vcpu); 2962 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 2963 struct vmcs_hdr hdr; 2964 2965 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 2966 return 0; 2967 2968 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2969 return -EINVAL; 2970 2971 if (ghc->gpa != vmcs12->vmcs_link_pointer && 2972 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 2973 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 2974 return -EINVAL; 2975 2976 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 2977 offsetof(struct vmcs12, hdr), 2978 sizeof(hdr)))) 2979 return -EINVAL; 2980 2981 if (CC(hdr.revision_id != VMCS12_REVISION) || 2982 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2983 return -EINVAL; 2984 2985 return 0; 2986 } 2987 2988 /* 2989 * Checks related to Guest Non-register State 2990 */ 2991 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2992 { 2993 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2994 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 2995 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 2996 return -EINVAL; 2997 2998 return 0; 2999 } 3000 3001 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3002 struct vmcs12 *vmcs12, 3003 enum vm_entry_failure_code *entry_failure_code) 3004 { 3005 bool ia32e; 3006 3007 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3008 3009 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3010 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3011 return -EINVAL; 3012 3013 if 
((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3014 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 3015 return -EINVAL; 3016 3017 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3018 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3019 return -EINVAL; 3020 3021 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3022 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3023 return -EINVAL; 3024 } 3025 3026 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3027 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3028 vmcs12->guest_ia32_perf_global_ctrl))) 3029 return -EINVAL; 3030 3031 /* 3032 * If the load IA32_EFER VM-entry control is 1, the following checks 3033 * are performed on the field for the IA32_EFER MSR: 3034 * - Bits reserved in the IA32_EFER MSR must be 0. 3035 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3036 * the IA-32e mode guest VM-exit control. It must also be identical 3037 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3038 * CR0.PG) is 1. 
3039 */ 3040 if (to_vmx(vcpu)->nested.nested_run_pending && 3041 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3042 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 3043 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3044 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3045 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3046 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3047 return -EINVAL; 3048 } 3049 3050 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3051 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3052 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3053 return -EINVAL; 3054 3055 if (nested_check_guest_non_reg_state(vmcs12)) 3056 return -EINVAL; 3057 3058 return 0; 3059 } 3060 3061 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3062 { 3063 struct vcpu_vmx *vmx = to_vmx(vcpu); 3064 unsigned long cr3, cr4; 3065 bool vm_fail; 3066 3067 if (!nested_early_check) 3068 return 0; 3069 3070 if (vmx->msr_autoload.host.nr) 3071 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3072 if (vmx->msr_autoload.guest.nr) 3073 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3074 3075 preempt_disable(); 3076 3077 vmx_prepare_switch_to_guest(vcpu); 3078 3079 /* 3080 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3081 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3082 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3083 * there is no need to preserve other bits or save/restore the field. 
3084 */ 3085 vmcs_writel(GUEST_RFLAGS, 0); 3086 3087 cr3 = __get_current_cr3_fast(); 3088 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3089 vmcs_writel(HOST_CR3, cr3); 3090 vmx->loaded_vmcs->host_state.cr3 = cr3; 3091 } 3092 3093 cr4 = cr4_read_shadow(); 3094 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3095 vmcs_writel(HOST_CR4, cr4); 3096 vmx->loaded_vmcs->host_state.cr4 = cr4; 3097 } 3098 3099 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3100 __vmx_vcpu_run_flags(vmx)); 3101 3102 if (vmx->msr_autoload.host.nr) 3103 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3104 if (vmx->msr_autoload.guest.nr) 3105 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3106 3107 if (vm_fail) { 3108 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3109 3110 preempt_enable(); 3111 3112 trace_kvm_nested_vmenter_failed( 3113 "early hardware check VM-instruction error: ", error); 3114 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3115 return 1; 3116 } 3117 3118 /* 3119 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3120 */ 3121 if (hw_breakpoint_active()) 3122 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3123 local_irq_enable(); 3124 preempt_enable(); 3125 3126 /* 3127 * A non-failing VMEntry means we somehow entered guest mode with 3128 * an illegal RIP, and that's just the tip of the iceberg. There 3129 * is no telling what memory has been modified or what state has 3130 * been exposed to unknown code. Hitting this all but guarantees 3131 * a (very critical) hardware issue. 3132 */ 3133 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3134 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3135 3136 return 0; 3137 } 3138 3139 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3140 { 3141 struct vcpu_vmx *vmx = to_vmx(vcpu); 3142 3143 /* 3144 * hv_evmcs may end up being not mapped after migration (when 3145 * L2 was running), map it here to make sure vmcs12 changes are 3146 * properly reflected. 
3147 */ 3148 if (vmx->nested.enlightened_vmcs_enabled && 3149 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3150 enum nested_evmptrld_status evmptrld_status = 3151 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3152 3153 if (evmptrld_status == EVMPTRLD_VMFAIL || 3154 evmptrld_status == EVMPTRLD_ERROR) 3155 return false; 3156 3157 /* 3158 * Post migration VMCS12 always provides the most actual 3159 * information, copy it to eVMCS upon entry. 3160 */ 3161 vmx->nested.need_vmcs12_to_shadow_sync = true; 3162 } 3163 3164 return true; 3165 } 3166 3167 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3168 { 3169 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3170 struct vcpu_vmx *vmx = to_vmx(vcpu); 3171 struct kvm_host_map *map; 3172 3173 if (!vcpu->arch.pdptrs_from_userspace && 3174 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3175 /* 3176 * Reload the guest's PDPTRs since after a migration 3177 * the guest CR3 might be restored prior to setting the nested 3178 * state which can lead to a load of wrong PDPTRs. 
3179 */ 3180 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3181 return false; 3182 } 3183 3184 3185 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3186 map = &vmx->nested.apic_access_page_map; 3187 3188 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3189 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3190 } else { 3191 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3192 __func__); 3193 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3194 vcpu->run->internal.suberror = 3195 KVM_INTERNAL_ERROR_EMULATION; 3196 vcpu->run->internal.ndata = 0; 3197 return false; 3198 } 3199 } 3200 3201 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3202 map = &vmx->nested.virtual_apic_map; 3203 3204 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3205 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3206 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3207 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3208 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3209 /* 3210 * The processor will never use the TPR shadow, simply 3211 * clear the bit from the execution control. Such a 3212 * configuration is useless, but it happens in tests. 3213 * For any other configuration, failing the vm entry is 3214 * _not_ what the processor does but it's basically the 3215 * only possibility we have. 3216 */ 3217 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3218 } else { 3219 /* 3220 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3221 * force VM-Entry to fail. 
3222 */ 3223 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3224 } 3225 } 3226 3227 if (nested_cpu_has_posted_intr(vmcs12)) { 3228 map = &vmx->nested.pi_desc_map; 3229 3230 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3231 vmx->nested.pi_desc = 3232 (struct pi_desc *)(((void *)map->hva) + 3233 offset_in_page(vmcs12->posted_intr_desc_addr)); 3234 vmcs_write64(POSTED_INTR_DESC_ADDR, 3235 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3236 } else { 3237 /* 3238 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3239 * access the contents of the VMCS12 posted interrupt 3240 * descriptor. (Note that KVM may do this when it 3241 * should not, per the architectural specification.) 3242 */ 3243 vmx->nested.pi_desc = NULL; 3244 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3245 } 3246 } 3247 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3248 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3249 else 3250 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3251 3252 return true; 3253 } 3254 3255 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3256 { 3257 if (!nested_get_evmcs_page(vcpu)) { 3258 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3259 __func__); 3260 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3261 vcpu->run->internal.suberror = 3262 KVM_INTERNAL_ERROR_EMULATION; 3263 vcpu->run->internal.ndata = 0; 3264 3265 return false; 3266 } 3267 3268 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3269 return false; 3270 3271 return true; 3272 } 3273 3274 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3275 { 3276 struct vmcs12 *vmcs12; 3277 struct vcpu_vmx *vmx = to_vmx(vcpu); 3278 gpa_t dst; 3279 3280 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3281 return 0; 3282 3283 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3284 return 1; 3285 3286 /* 3287 * Check if PML is enabled for the nested guest. 
Whether eptp bit 6 is 3288 * set is already checked as part of A/D emulation. 3289 */ 3290 vmcs12 = get_vmcs12(vcpu); 3291 if (!nested_cpu_has_pml(vmcs12)) 3292 return 0; 3293 3294 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3295 vmx->nested.pml_full = true; 3296 return 1; 3297 } 3298 3299 gpa &= ~0xFFFull; 3300 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3301 3302 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3303 offset_in_page(dst), sizeof(gpa))) 3304 return 0; 3305 3306 vmcs12->guest_pml_index--; 3307 3308 return 0; 3309 } 3310 3311 /* 3312 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3313 * for running VMX instructions (except VMXON, whose prerequisites are 3314 * slightly different). It also specifies what exception to inject otherwise. 3315 * Note that many of these exceptions have priority over VM exits, so they 3316 * don't have to be checked again here. 3317 */ 3318 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3319 { 3320 if (!to_vmx(vcpu)->nested.vmxon) { 3321 kvm_queue_exception(vcpu, UD_VECTOR); 3322 return 0; 3323 } 3324 3325 if (vmx_get_cpl(vcpu)) { 3326 kvm_inject_gp(vcpu, 0); 3327 return 0; 3328 } 3329 3330 return 1; 3331 } 3332 3333 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3334 { 3335 u8 rvi = vmx_get_rvi(); 3336 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3337 3338 return ((rvi & 0xf0) > (vppr & 0xf0)); 3339 } 3340 3341 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3342 struct vmcs12 *vmcs12); 3343 3344 /* 3345 * If from_vmentry is false, this is being called from state restore (either RSM 3346 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
3347 * 3348 * Returns: 3349 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3350 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3351 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3352 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3353 */ 3354 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3355 bool from_vmentry) 3356 { 3357 struct vcpu_vmx *vmx = to_vmx(vcpu); 3358 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3359 enum vm_entry_failure_code entry_failure_code; 3360 bool evaluate_pending_interrupts; 3361 union vmx_exit_reason exit_reason = { 3362 .basic = EXIT_REASON_INVALID_STATE, 3363 .failed_vmentry = 1, 3364 }; 3365 u32 failed_index; 3366 3367 kvm_service_local_tlb_flush_requests(vcpu); 3368 3369 evaluate_pending_interrupts = exec_controls_get(vmx) & 3370 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3371 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3372 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3373 3374 if (!vmx->nested.nested_run_pending || 3375 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3376 vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3377 if (kvm_mpx_supported() && 3378 (!vmx->nested.nested_run_pending || 3379 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3380 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3381 3382 /* 3383 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3384 * nested early checks are disabled. In the event of a "late" VM-Fail, 3385 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3386 * software model to the pre-VMEntry host state. When EPT is disabled, 3387 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3388 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3389 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3390 * the correct value. 
Smashing vmcs01.GUEST_CR3 is safe because nested 3391 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3392 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3393 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3394 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3395 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3396 * path would need to manually save/restore vmcs01.GUEST_CR3. 3397 */ 3398 if (!enable_ept && !nested_early_check) 3399 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3400 3401 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3402 3403 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3404 3405 if (from_vmentry) { 3406 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3407 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3408 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3409 } 3410 3411 if (nested_vmx_check_vmentry_hw(vcpu)) { 3412 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3413 return NVMX_VMENTRY_VMFAIL; 3414 } 3415 3416 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3417 &entry_failure_code)) { 3418 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3419 vmcs12->exit_qualification = entry_failure_code; 3420 goto vmentry_fail_vmexit; 3421 } 3422 } 3423 3424 enter_guest_mode(vcpu); 3425 3426 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3427 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3428 vmcs12->exit_qualification = entry_failure_code; 3429 goto vmentry_fail_vmexit_guest_mode; 3430 } 3431 3432 if (from_vmentry) { 3433 failed_index = nested_vmx_load_msr(vcpu, 3434 vmcs12->vm_entry_msr_load_addr, 3435 vmcs12->vm_entry_msr_load_count); 3436 if (failed_index) { 3437 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3438 vmcs12->exit_qualification = failed_index; 3439 goto vmentry_fail_vmexit_guest_mode; 3440 } 3441 } else { 3442 /* 3443 * The MMU is not initialized to point at the right entities yet and 3444 * "get pages" would need to read data from the guest 
(i.e. we will 3445 * need to perform gpa to hpa translation). Request a call 3446 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3447 * have already been set at vmentry time and should not be reset. 3448 */ 3449 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3450 } 3451 3452 /* 3453 * If L1 had a pending IRQ/NMI until it executed 3454 * VMLAUNCH/VMRESUME which wasn't delivered because it was 3455 * disallowed (e.g. interrupts disabled), L0 needs to 3456 * evaluate if this pending event should cause an exit from L2 3457 * to L1 or delivered directly to L2 (e.g. In case L1 don't 3458 * intercept EXTERNAL_INTERRUPT). 3459 * 3460 * Usually this would be handled by the processor noticing an 3461 * IRQ/NMI window request, or checking RVI during evaluation of 3462 * pending virtual interrupts. However, this setting was done 3463 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 3464 * to perform pending event evaluation by requesting a KVM_REQ_EVENT. 3465 */ 3466 if (unlikely(evaluate_pending_interrupts)) 3467 kvm_make_request(KVM_REQ_EVENT, vcpu); 3468 3469 /* 3470 * Do not start the preemption timer hrtimer until after we know 3471 * we are successful, so that only nested_vmx_vmexit needs to cancel 3472 * the timer. 3473 */ 3474 vmx->nested.preemption_timer_expired = false; 3475 if (nested_cpu_has_preemption_timer(vmcs12)) { 3476 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3477 vmx_start_preemption_timer(vcpu, timer_value); 3478 } 3479 3480 /* 3481 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3482 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3483 * returned as far as L1 is concerned. It will only return (and set 3484 * the success flag) when L2 exits (see nested_vmx_vmexit()). 
3485 */ 3486 return NVMX_VMENTRY_SUCCESS; 3487 3488 /* 3489 * A failed consistency check that leads to a VMExit during L1's 3490 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3491 * 26.7 "VM-entry failures during or after loading guest state". 3492 */ 3493 vmentry_fail_vmexit_guest_mode: 3494 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3495 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3496 leave_guest_mode(vcpu); 3497 3498 vmentry_fail_vmexit: 3499 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3500 3501 if (!from_vmentry) 3502 return NVMX_VMENTRY_VMEXIT; 3503 3504 load_vmcs12_host_state(vcpu, vmcs12); 3505 vmcs12->vm_exit_reason = exit_reason.full; 3506 if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 3507 vmx->nested.need_vmcs12_to_shadow_sync = true; 3508 return NVMX_VMENTRY_VMEXIT; 3509 } 3510 3511 /* 3512 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3513 * for running an L2 nested guest. 3514 */ 3515 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3516 { 3517 struct vmcs12 *vmcs12; 3518 enum nvmx_vmentry_status status; 3519 struct vcpu_vmx *vmx = to_vmx(vcpu); 3520 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3521 enum nested_evmptrld_status evmptrld_status; 3522 3523 if (!nested_vmx_check_permission(vcpu)) 3524 return 1; 3525 3526 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3527 if (evmptrld_status == EVMPTRLD_ERROR) { 3528 kvm_queue_exception(vcpu, UD_VECTOR); 3529 return 1; 3530 } 3531 3532 kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 3533 3534 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3535 return nested_vmx_failInvalid(vcpu); 3536 3537 if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && 3538 vmx->nested.current_vmptr == INVALID_GPA)) 3539 return nested_vmx_failInvalid(vcpu); 3540 3541 vmcs12 = get_vmcs12(vcpu); 3542 3543 /* 3544 * Can't VMLAUNCH or VMRESUME a shadow VMCS. 
Despite the fact 3545 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3546 * rather than RFLAGS.ZF, and no error number is stored to the 3547 * VM-instruction error field. 3548 */ 3549 if (CC(vmcs12->hdr.shadow_vmcs)) 3550 return nested_vmx_failInvalid(vcpu); 3551 3552 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 3553 copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); 3554 /* Enlightened VMCS doesn't have launch state */ 3555 vmcs12->launch_state = !launch; 3556 } else if (enable_shadow_vmcs) { 3557 copy_shadow_to_vmcs12(vmx); 3558 } 3559 3560 /* 3561 * The nested entry process starts with enforcing various prerequisites 3562 * on vmcs12 as required by the Intel SDM, and act appropriately when 3563 * they fail: As the SDM explains, some conditions should cause the 3564 * instruction to fail, while others will cause the instruction to seem 3565 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3566 * To speed up the normal (success) code path, we should avoid checking 3567 * for misconfigurations which will anyway be caught by the processor 3568 * when using the merged vmcs02. 3569 */ 3570 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3571 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3572 3573 if (CC(vmcs12->launch_state == launch)) 3574 return nested_vmx_fail(vcpu, 3575 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS 3576 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3577 3578 if (nested_vmx_check_controls(vcpu, vmcs12)) 3579 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3580 3581 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3582 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3583 3584 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3585 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3586 3587 /* 3588 * We're finally done with prerequisite checking, and can start with 3589 * the nested entry. 
3590 */ 3591 vmx->nested.nested_run_pending = 1; 3592 vmx->nested.has_preemption_timer_deadline = false; 3593 status = nested_vmx_enter_non_root_mode(vcpu, true); 3594 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3595 goto vmentry_failed; 3596 3597 /* Emulate processing of posted interrupts on VM-Enter. */ 3598 if (nested_cpu_has_posted_intr(vmcs12) && 3599 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3600 vmx->nested.pi_pending = true; 3601 kvm_make_request(KVM_REQ_EVENT, vcpu); 3602 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3603 } 3604 3605 /* Hide L1D cache contents from the nested guest. */ 3606 vmx->vcpu.arch.l1tf_flush_l1d = true; 3607 3608 /* 3609 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3610 * also be used as part of restoring nVMX state for 3611 * snapshot restore (migration). 3612 * 3613 * In this flow, it is assumed that vmcs12 cache was 3614 * transferred as part of captured nVMX state and should 3615 * therefore not be read from guest memory (which may not 3616 * exist on destination host yet). 3617 */ 3618 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3619 3620 switch (vmcs12->guest_activity_state) { 3621 case GUEST_ACTIVITY_HLT: 3622 /* 3623 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3624 * awakened by event injection or by an NMI-window VM-exit or 3625 * by an interrupt-window VM-exit, halt the vcpu. 
3626 */ 3627 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3628 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3629 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3630 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3631 vmx->nested.nested_run_pending = 0; 3632 return kvm_emulate_halt_noskip(vcpu); 3633 } 3634 break; 3635 case GUEST_ACTIVITY_WAIT_SIPI: 3636 vmx->nested.nested_run_pending = 0; 3637 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3638 break; 3639 default: 3640 break; 3641 } 3642 3643 return 1; 3644 3645 vmentry_failed: 3646 vmx->nested.nested_run_pending = 0; 3647 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3648 return 0; 3649 if (status == NVMX_VMENTRY_VMEXIT) 3650 return 1; 3651 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3652 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3653 } 3654 3655 /* 3656 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3657 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3658 * This function returns the new value we should put in vmcs12.guest_cr0. 3659 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3660 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3661 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3662 * didn't trap the bit, because if L1 did, so would L0). 3663 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3664 * been modified by L2, and L1 knows it. So just leave the old value of 3665 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3666 * isn't relevant, because if L0 traps this bit it can set it to anything. 3667 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3668 * changed these bits, and therefore they need to be updated, but L0 3669 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3670 * put them in vmcs02 CR0_READ_SHADOW. 
So take these bits from there. 3671 */ 3672 static inline unsigned long 3673 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3674 { 3675 return 3676 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3677 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3678 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3679 vcpu->arch.cr0_guest_owned_bits)); 3680 } 3681 3682 static inline unsigned long 3683 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3684 { 3685 return 3686 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3687 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3688 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3689 vcpu->arch.cr4_guest_owned_bits)); 3690 } 3691 3692 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3693 struct vmcs12 *vmcs12, 3694 u32 vm_exit_reason, u32 exit_intr_info) 3695 { 3696 u32 idt_vectoring; 3697 unsigned int nr; 3698 3699 /* 3700 * Per the SDM, VM-Exits due to double and triple faults are never 3701 * considered to occur during event delivery, even if the double/triple 3702 * fault is the result of an escalating vectoring issue. 3703 * 3704 * Note, the SDM qualifies the double fault behavior with "The original 3705 * event results in a double-fault exception". It's unclear why the 3706 * qualification exists since exits due to double fault can occur only 3707 * while vectoring a different exception (injected events are never 3708 * subject to interception), i.e. there's _always_ an original event. 3709 * 3710 * The SDM also uses NMI as a confusing example for the "original event 3711 * causes the VM exit directly" clause. NMI isn't special in any way, 3712 * the same rule applies to all events that cause an exit directly. 3713 * NMI is an odd choice for the example because NMIs can only occur on 3714 * instruction boundaries, i.e. they _can't_ occur during vectoring. 
3715 */ 3716 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3717 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3718 is_double_fault(exit_intr_info))) { 3719 vmcs12->idt_vectoring_info_field = 0; 3720 } else if (vcpu->arch.exception.injected) { 3721 nr = vcpu->arch.exception.nr; 3722 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3723 3724 if (kvm_exception_is_soft(nr)) { 3725 vmcs12->vm_exit_instruction_len = 3726 vcpu->arch.event_exit_inst_len; 3727 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3728 } else 3729 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3730 3731 if (vcpu->arch.exception.has_error_code) { 3732 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3733 vmcs12->idt_vectoring_error_code = 3734 vcpu->arch.exception.error_code; 3735 } 3736 3737 vmcs12->idt_vectoring_info_field = idt_vectoring; 3738 } else if (vcpu->arch.nmi_injected) { 3739 vmcs12->idt_vectoring_info_field = 3740 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3741 } else if (vcpu->arch.interrupt.injected) { 3742 nr = vcpu->arch.interrupt.nr; 3743 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3744 3745 if (vcpu->arch.interrupt.soft) { 3746 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3747 vmcs12->vm_entry_instruction_len = 3748 vcpu->arch.event_exit_inst_len; 3749 } else 3750 idt_vectoring |= INTR_TYPE_EXT_INTR; 3751 3752 vmcs12->idt_vectoring_info_field = idt_vectoring; 3753 } else { 3754 vmcs12->idt_vectoring_info_field = 0; 3755 } 3756 } 3757 3758 3759 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3760 { 3761 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3762 gfn_t gfn; 3763 3764 /* 3765 * Don't need to mark the APIC access page dirty; it is never 3766 * written to by the CPU during APIC virtualization. 
3767 */ 3768 3769 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3770 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3771 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3772 } 3773 3774 if (nested_cpu_has_posted_intr(vmcs12)) { 3775 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3776 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3777 } 3778 } 3779 3780 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3781 { 3782 struct vcpu_vmx *vmx = to_vmx(vcpu); 3783 int max_irr; 3784 void *vapic_page; 3785 u16 status; 3786 3787 if (!vmx->nested.pi_pending) 3788 return 0; 3789 3790 if (!vmx->nested.pi_desc) 3791 goto mmio_needed; 3792 3793 vmx->nested.pi_pending = false; 3794 3795 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3796 return 0; 3797 3798 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3799 if (max_irr != 256) { 3800 vapic_page = vmx->nested.virtual_apic_map.hva; 3801 if (!vapic_page) 3802 goto mmio_needed; 3803 3804 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3805 vapic_page, &max_irr); 3806 status = vmcs_read16(GUEST_INTR_STATUS); 3807 if ((u8)max_irr > ((u8)status & 0xff)) { 3808 status &= ~0xff; 3809 status |= (u8)max_irr; 3810 vmcs_write16(GUEST_INTR_STATUS, status); 3811 } 3812 } 3813 3814 nested_mark_vmcs12_pages_dirty(vcpu); 3815 return 0; 3816 3817 mmio_needed: 3818 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3819 return -ENXIO; 3820 } 3821 3822 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3823 unsigned long exit_qual) 3824 { 3825 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3826 unsigned int nr = vcpu->arch.exception.nr; 3827 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3828 3829 if (vcpu->arch.exception.has_error_code) { 3830 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3831 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3832 } 3833 3834 if (kvm_exception_is_soft(nr)) 3835 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3836 else 3837 intr_info |= 
INTR_TYPE_HARD_EXCEPTION; 3838 3839 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3840 vmx_get_nmi_mask(vcpu)) 3841 intr_info |= INTR_INFO_UNBLOCK_NMI; 3842 3843 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3844 } 3845 3846 /* 3847 * Returns true if a debug trap is pending delivery. 3848 * 3849 * In KVM, debug traps bear an exception payload. As such, the class of a #DB 3850 * exception may be inferred from the presence of an exception payload. 3851 */ 3852 static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) 3853 { 3854 return vcpu->arch.exception.pending && 3855 vcpu->arch.exception.nr == DB_VECTOR && 3856 vcpu->arch.exception.payload; 3857 } 3858 3859 /* 3860 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 3861 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 3862 * represents these debug traps with a payload that is said to be compatible 3863 * with the 'pending debug exceptions' field, write the payload to the VMCS 3864 * field if a VM-exit is delivered before the debug trap. 3865 */ 3866 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 3867 { 3868 if (vmx_pending_dbg_trap(vcpu)) 3869 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 3870 vcpu->arch.exception.payload); 3871 } 3872 3873 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 3874 { 3875 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3876 to_vmx(vcpu)->nested.preemption_timer_expired; 3877 } 3878 3879 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 3880 { 3881 struct vcpu_vmx *vmx = to_vmx(vcpu); 3882 unsigned long exit_qual; 3883 bool block_nested_events = 3884 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3885 bool mtf_pending = vmx->nested.mtf_pending; 3886 struct kvm_lapic *apic = vcpu->arch.apic; 3887 3888 /* 3889 * Clear the MTF state. 
If a higher priority VM-exit is delivered first, 3890 * this state is discarded. 3891 */ 3892 if (!block_nested_events) 3893 vmx->nested.mtf_pending = false; 3894 3895 if (lapic_in_kernel(vcpu) && 3896 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3897 if (block_nested_events) 3898 return -EBUSY; 3899 nested_vmx_update_pending_dbg(vcpu); 3900 clear_bit(KVM_APIC_INIT, &apic->pending_events); 3901 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 3902 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3903 return 0; 3904 } 3905 3906 if (lapic_in_kernel(vcpu) && 3907 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 3908 if (block_nested_events) 3909 return -EBUSY; 3910 3911 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 3912 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 3913 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 3914 apic->sipi_vector & 0xFFUL); 3915 return 0; 3916 } 3917 3918 /* 3919 * Process any exceptions that are not debug traps before MTF. 3920 * 3921 * Note that only a pending nested run can block a pending exception. 3922 * Otherwise an injected NMI/interrupt should either be 3923 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO, 3924 * while delivering the pending exception. 
3925 */ 3926 3927 if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { 3928 if (vmx->nested.nested_run_pending) 3929 return -EBUSY; 3930 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3931 goto no_vmexit; 3932 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3933 return 0; 3934 } 3935 3936 if (mtf_pending) { 3937 if (block_nested_events) 3938 return -EBUSY; 3939 nested_vmx_update_pending_dbg(vcpu); 3940 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 3941 return 0; 3942 } 3943 3944 if (vcpu->arch.exception.pending) { 3945 if (vmx->nested.nested_run_pending) 3946 return -EBUSY; 3947 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3948 goto no_vmexit; 3949 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3950 return 0; 3951 } 3952 3953 if (nested_vmx_preemption_timer_pending(vcpu)) { 3954 if (block_nested_events) 3955 return -EBUSY; 3956 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3957 return 0; 3958 } 3959 3960 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 3961 if (block_nested_events) 3962 return -EBUSY; 3963 goto no_vmexit; 3964 } 3965 3966 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 3967 if (block_nested_events) 3968 return -EBUSY; 3969 if (!nested_exit_on_nmi(vcpu)) 3970 goto no_vmexit; 3971 3972 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3973 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3974 INTR_INFO_VALID_MASK, 0); 3975 /* 3976 * The NMI-triggered VM exit counts as injection: 3977 * clear this one and block further NMIs. 
3978 */ 3979 vcpu->arch.nmi_pending = 0; 3980 vmx_set_nmi_mask(vcpu, true); 3981 return 0; 3982 } 3983 3984 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 3985 if (block_nested_events) 3986 return -EBUSY; 3987 if (!nested_exit_on_intr(vcpu)) 3988 goto no_vmexit; 3989 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3990 return 0; 3991 } 3992 3993 no_vmexit: 3994 return vmx_complete_nested_posted_interrupt(vcpu); 3995 } 3996 3997 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3998 { 3999 ktime_t remaining = 4000 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4001 u64 value; 4002 4003 if (ktime_to_ns(remaining) <= 0) 4004 return 0; 4005 4006 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4007 do_div(value, 1000000); 4008 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4009 } 4010 4011 static bool is_vmcs12_ext_field(unsigned long field) 4012 { 4013 switch (field) { 4014 case GUEST_ES_SELECTOR: 4015 case GUEST_CS_SELECTOR: 4016 case GUEST_SS_SELECTOR: 4017 case GUEST_DS_SELECTOR: 4018 case GUEST_FS_SELECTOR: 4019 case GUEST_GS_SELECTOR: 4020 case GUEST_LDTR_SELECTOR: 4021 case GUEST_TR_SELECTOR: 4022 case GUEST_ES_LIMIT: 4023 case GUEST_CS_LIMIT: 4024 case GUEST_SS_LIMIT: 4025 case GUEST_DS_LIMIT: 4026 case GUEST_FS_LIMIT: 4027 case GUEST_GS_LIMIT: 4028 case GUEST_LDTR_LIMIT: 4029 case GUEST_TR_LIMIT: 4030 case GUEST_GDTR_LIMIT: 4031 case GUEST_IDTR_LIMIT: 4032 case GUEST_ES_AR_BYTES: 4033 case GUEST_DS_AR_BYTES: 4034 case GUEST_FS_AR_BYTES: 4035 case GUEST_GS_AR_BYTES: 4036 case GUEST_LDTR_AR_BYTES: 4037 case GUEST_TR_AR_BYTES: 4038 case GUEST_ES_BASE: 4039 case GUEST_CS_BASE: 4040 case GUEST_SS_BASE: 4041 case GUEST_DS_BASE: 4042 case GUEST_FS_BASE: 4043 case GUEST_GS_BASE: 4044 case GUEST_LDTR_BASE: 4045 case GUEST_TR_BASE: 4046 case GUEST_GDTR_BASE: 4047 case GUEST_IDTR_BASE: 4048 case GUEST_PENDING_DBG_EXCEPTIONS: 4049 case GUEST_BNDCFGS: 4050 return true; 4051 default: 
4052 break; 4053 } 4054 4055 return false; 4056 } 4057 4058 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4059 struct vmcs12 *vmcs12) 4060 { 4061 struct vcpu_vmx *vmx = to_vmx(vcpu); 4062 4063 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4064 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4065 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4066 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4067 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4068 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4069 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4070 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4071 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4072 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4073 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4074 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4075 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4076 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4077 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4078 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4079 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4080 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4081 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4082 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4083 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4084 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4085 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4086 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4087 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4088 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4089 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4090 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4091 vmcs12->guest_fs_base = 
vmcs_readl(GUEST_FS_BASE); 4092 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4093 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4094 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4095 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4096 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4097 vmcs12->guest_pending_dbg_exceptions = 4098 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4099 4100 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4101 } 4102 4103 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4104 struct vmcs12 *vmcs12) 4105 { 4106 struct vcpu_vmx *vmx = to_vmx(vcpu); 4107 int cpu; 4108 4109 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4110 return; 4111 4112 4113 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4114 4115 cpu = get_cpu(); 4116 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4117 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4118 4119 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4120 4121 vmx->loaded_vmcs = &vmx->vmcs01; 4122 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4123 put_cpu(); 4124 } 4125 4126 /* 4127 * Update the guest state fields of vmcs12 to reflect changes that 4128 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4129 * VM-entry controls is also updated, since this is really a guest 4130 * state bit.) 
4131 */ 4132 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4133 { 4134 struct vcpu_vmx *vmx = to_vmx(vcpu); 4135 4136 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 4137 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4138 4139 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4140 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); 4141 4142 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4143 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4144 4145 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4146 vmcs12->guest_rip = kvm_rip_read(vcpu); 4147 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4148 4149 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4150 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4151 4152 vmcs12->guest_interruptibility_info = 4153 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4154 4155 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4156 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4157 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4158 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4159 else 4160 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4161 4162 if (nested_cpu_has_preemption_timer(vmcs12) && 4163 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4164 !vmx->nested.nested_run_pending) 4165 vmcs12->vmx_preemption_timer_value = 4166 vmx_get_preemption_timer_value(vcpu); 4167 4168 /* 4169 * In some cases (usually, nested EPT), L2 is allowed to change its 4170 * own CR3 without exiting. If it has changed it, we must keep it. 4171 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4172 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4173 * 4174 * Additionally, restore L2's PDPTR to vmcs12. 
4175 */ 4176 if (enable_ept) { 4177 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4178 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4179 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4180 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4181 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4182 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4183 } 4184 } 4185 4186 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4187 4188 if (nested_cpu_has_vid(vmcs12)) 4189 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4190 4191 vmcs12->vm_entry_controls = 4192 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4193 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4194 4195 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4196 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 4197 4198 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4199 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4200 } 4201 4202 /* 4203 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4204 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4205 * and this function updates it to reflect the changes to the guest state while 4206 * L2 was running (and perhaps made some exits which were handled directly by L0 4207 * without going back to L1), and to reflect the exit reason. 4208 * Note that we do not have to copy here all VMCS fields, just those that 4209 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4210 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4211 * which already writes to vmcs12 directly. 
4212 */ 4213 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4214 u32 vm_exit_reason, u32 exit_intr_info, 4215 unsigned long exit_qualification) 4216 { 4217 /* update exit information fields: */ 4218 vmcs12->vm_exit_reason = vm_exit_reason; 4219 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4220 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4221 vmcs12->exit_qualification = exit_qualification; 4222 4223 /* 4224 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4225 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4226 * exit info fields are unmodified. 4227 */ 4228 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4229 vmcs12->launch_state = 1; 4230 4231 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4232 * instead of reading the real value. */ 4233 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4234 4235 /* 4236 * Transfer the event that L0 or L1 may wanted to inject into 4237 * L2 to IDT_VECTORING_INFO_FIELD. 4238 */ 4239 vmcs12_save_pending_event(vcpu, vmcs12, 4240 vm_exit_reason, exit_intr_info); 4241 4242 vmcs12->vm_exit_intr_info = exit_intr_info; 4243 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4244 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4245 4246 /* 4247 * According to spec, there's no need to store the guest's 4248 * MSRs if the exit is due to a VM-entry failure that occurs 4249 * during or after loading the guest state. Since this exit 4250 * does not fall in that category, we need to save the MSRs. 4251 */ 4252 if (nested_vmx_store_msr(vcpu, 4253 vmcs12->vm_exit_msr_store_addr, 4254 vmcs12->vm_exit_msr_store_count)) 4255 nested_vmx_abort(vcpu, 4256 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4257 } 4258 4259 /* 4260 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 4261 * preserved above and would only end up incorrectly in L1. 
4262 */ 4263 vcpu->arch.nmi_injected = false; 4264 kvm_clear_exception_queue(vcpu); 4265 kvm_clear_interrupt_queue(vcpu); 4266 } 4267 4268 /* 4269 * A part of what we need to when the nested L2 guest exits and we want to 4270 * run its L1 parent, is to reset L1's guest state to the host state specified 4271 * in vmcs12. 4272 * This function is to be called not only on normal nested exit, but also on 4273 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4274 * Failures During or After Loading Guest State"). 4275 * This function should be called when the active VMCS is L1's (vmcs01). 4276 */ 4277 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4278 struct vmcs12 *vmcs12) 4279 { 4280 enum vm_entry_failure_code ignored; 4281 struct kvm_segment seg; 4282 4283 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4284 vcpu->arch.efer = vmcs12->host_ia32_efer; 4285 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4286 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4287 else 4288 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4289 vmx_set_efer(vcpu, vcpu->arch.efer); 4290 4291 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4292 kvm_rip_write(vcpu, vmcs12->host_rip); 4293 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4294 vmx_set_interrupt_shadow(vcpu, 0); 4295 4296 /* 4297 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4298 * actually changed, because vmx_set_cr0 refers to efer set above. 4299 * 4300 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4301 * (KVM doesn't change it); 4302 */ 4303 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4304 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4305 4306 /* Same as above - no reason to call set_cr4_guest_host_mask(). 
*/ 4307 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4308 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4309 4310 nested_ept_uninit_mmu_context(vcpu); 4311 4312 /* 4313 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4314 * couldn't have changed. 4315 */ 4316 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4317 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4318 4319 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4320 4321 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4322 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4323 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4324 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4325 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4326 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4327 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4328 4329 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4330 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4331 vmcs_write64(GUEST_BNDCFGS, 0); 4332 4333 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4334 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4335 vcpu->arch.pat = vmcs12->host_ia32_pat; 4336 } 4337 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4338 intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4339 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4340 vmcs12->host_ia32_perf_global_ctrl)); 4341 4342 /* Set L1 segment info according to Intel SDM 4343 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4344 seg = (struct kvm_segment) { 4345 .base = 0, 4346 .limit = 0xFFFFFFFF, 4347 .selector = vmcs12->host_cs_selector, 4348 .type = 11, 4349 .present = 1, 4350 .s = 1, 4351 .g = 1 4352 }; 4353 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4354 seg.l = 1; 4355 else 4356 seg.db = 1; 4357 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4358 seg = (struct 
kvm_segment) { 4359 .base = 0, 4360 .limit = 0xFFFFFFFF, 4361 .type = 3, 4362 .present = 1, 4363 .s = 1, 4364 .db = 1, 4365 .g = 1 4366 }; 4367 seg.selector = vmcs12->host_ds_selector; 4368 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4369 seg.selector = vmcs12->host_es_selector; 4370 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4371 seg.selector = vmcs12->host_ss_selector; 4372 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4373 seg.selector = vmcs12->host_fs_selector; 4374 seg.base = vmcs12->host_fs_base; 4375 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4376 seg.selector = vmcs12->host_gs_selector; 4377 seg.base = vmcs12->host_gs_base; 4378 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4379 seg = (struct kvm_segment) { 4380 .base = vmcs12->host_tr_base, 4381 .limit = 0x67, 4382 .selector = vmcs12->host_tr_selector, 4383 .type = 11, 4384 .present = 1 4385 }; 4386 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4387 4388 memset(&seg, 0, sizeof(seg)); 4389 seg.unusable = 1; 4390 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4391 4392 kvm_set_dr(vcpu, 7, 0x400); 4393 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4394 4395 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4396 vmcs12->vm_exit_msr_load_count)) 4397 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4398 4399 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4400 } 4401 4402 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4403 { 4404 struct vmx_uret_msr *efer_msr; 4405 unsigned int i; 4406 4407 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4408 return vmcs_read64(GUEST_IA32_EFER); 4409 4410 if (cpu_has_load_ia32_efer()) 4411 return host_efer; 4412 4413 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4414 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4415 return vmx->msr_autoload.guest.val[i].value; 4416 } 4417 4418 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4419 if (efer_msr) 4420 return efer_msr->data; 4421 4422 return host_efer; 4423 } 
4424 4425 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4426 { 4427 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4428 struct vcpu_vmx *vmx = to_vmx(vcpu); 4429 struct vmx_msr_entry g, h; 4430 gpa_t gpa; 4431 u32 i, j; 4432 4433 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4434 4435 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4436 /* 4437 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4438 * as vmcs01.GUEST_DR7 contains a userspace defined value 4439 * and vcpu->arch.dr7 is not squirreled away before the 4440 * nested VMENTER (not worth adding a variable in nested_vmx). 4441 */ 4442 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4443 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4444 else 4445 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4446 } 4447 4448 /* 4449 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4450 * handle a variety of side effects to KVM's software model. 4451 */ 4452 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4453 4454 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4455 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4456 4457 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4458 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4459 4460 nested_ept_uninit_mmu_context(vcpu); 4461 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4462 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4463 4464 /* 4465 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4466 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4467 * VMFail, like everything else we just need to ensure our 4468 * software model is up-to-date. 
4469 */ 4470 if (enable_ept && is_pae_paging(vcpu)) 4471 ept_save_pdptrs(vcpu); 4472 4473 kvm_mmu_reset_context(vcpu); 4474 4475 /* 4476 * This nasty bit of open coding is a compromise between blindly 4477 * loading L1's MSRs using the exit load lists (incorrect emulation 4478 * of VMFail), leaving the nested VM's MSRs in the software model 4479 * (incorrect behavior) and snapshotting the modified MSRs (too 4480 * expensive since the lists are unbound by hardware). For each 4481 * MSR that was (prematurely) loaded from the nested VMEntry load 4482 * list, reload it from the exit load list if it exists and differs 4483 * from the guest value. The intent is to stuff host state as 4484 * silently as possible, not to fully process the exit load list. 4485 */ 4486 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4487 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4488 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4489 pr_debug_ratelimited( 4490 "%s read MSR index failed (%u, 0x%08llx)\n", 4491 __func__, i, gpa); 4492 goto vmabort; 4493 } 4494 4495 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4496 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4497 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4498 pr_debug_ratelimited( 4499 "%s read MSR failed (%u, 0x%08llx)\n", 4500 __func__, j, gpa); 4501 goto vmabort; 4502 } 4503 if (h.index != g.index) 4504 continue; 4505 if (h.value == g.value) 4506 break; 4507 4508 if (nested_vmx_load_msr_check(vcpu, &h)) { 4509 pr_debug_ratelimited( 4510 "%s check failed (%u, 0x%x, 0x%x)\n", 4511 __func__, j, h.index, h.reserved); 4512 goto vmabort; 4513 } 4514 4515 if (kvm_set_msr(vcpu, h.index, h.value)) { 4516 pr_debug_ratelimited( 4517 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4518 __func__, j, h.index, h.value); 4519 goto vmabort; 4520 } 4521 } 4522 } 4523 4524 return; 4525 4526 vmabort: 4527 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4528 } 4529 4530 /* 4531 * Emulate an exit from 
nested guest (L2) to L1, i.e., prepare to run L1 4532 * and modify vmcs12 to make it see what it would expect to see there if 4533 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4534 */ 4535 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4536 u32 exit_intr_info, unsigned long exit_qualification) 4537 { 4538 struct vcpu_vmx *vmx = to_vmx(vcpu); 4539 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4540 4541 /* trying to cancel vmlaunch/vmresume is a bug */ 4542 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4543 4544 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4545 /* 4546 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4547 * Enlightened VMCS after migration and we still need to 4548 * do that when something is forcing L2->L1 exit prior to 4549 * the first L2 run. 4550 */ 4551 (void)nested_get_evmcs_page(vcpu); 4552 } 4553 4554 /* Service pending TLB flush requests for L2 before switching to L1. */ 4555 kvm_service_local_tlb_flush_requests(vcpu); 4556 4557 /* 4558 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4559 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4560 * up-to-date before switching to L1. 
4561 */ 4562 if (enable_ept && is_pae_paging(vcpu)) 4563 vmx_ept_load_pdptrs(vcpu); 4564 4565 leave_guest_mode(vcpu); 4566 4567 if (nested_cpu_has_preemption_timer(vmcs12)) 4568 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4569 4570 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4571 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4572 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4573 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4574 } 4575 4576 if (likely(!vmx->fail)) { 4577 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4578 4579 if (vm_exit_reason != -1) 4580 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4581 exit_intr_info, exit_qualification); 4582 4583 /* 4584 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4585 * also be used to capture vmcs12 cache as part of 4586 * capturing nVMX state for snapshot (migration). 4587 * 4588 * Otherwise, this flush will dirty guest memory at a 4589 * point it is already assumed by user-space to be 4590 * immutable. 4591 */ 4592 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4593 } else { 4594 /* 4595 * The only expected VM-instruction error is "VM entry with 4596 * invalid control field(s)." Anything else indicates a 4597 * problem with L0. And we should never get here with a 4598 * VMFail of any type if early consistency checks are enabled. 
4599 */ 4600 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4601 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4602 WARN_ON_ONCE(nested_early_check); 4603 } 4604 4605 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4606 4607 /* Update any VMCS fields that might have changed while L2 ran */ 4608 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4609 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4610 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4611 if (kvm_caps.has_tsc_control) 4612 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 4613 4614 if (vmx->nested.l1_tpr_threshold != -1) 4615 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4616 4617 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4618 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4619 vmx_set_virtual_apic_mode(vcpu); 4620 } 4621 4622 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4623 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4624 vmx_update_cpu_dirty_logging(vcpu); 4625 } 4626 4627 /* Unpin physical memory we referred to in vmcs02 */ 4628 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 4629 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4630 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4631 vmx->nested.pi_desc = NULL; 4632 4633 if (vmx->nested.reload_vmcs01_apic_access_page) { 4634 vmx->nested.reload_vmcs01_apic_access_page = false; 4635 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4636 } 4637 4638 if (vmx->nested.update_vmcs01_apicv_status) { 4639 vmx->nested.update_vmcs01_apicv_status = false; 4640 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 4641 } 4642 4643 if ((vm_exit_reason != -1) && 4644 (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) 4645 vmx->nested.need_vmcs12_to_shadow_sync = true; 4646 4647 /* in case we halted in L2 */ 4648 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4649 4650 if (likely(!vmx->fail)) { 4651 if ((u16)vm_exit_reason == 
EXIT_REASON_EXTERNAL_INTERRUPT && 4652 nested_exit_intr_ack_set(vcpu)) { 4653 int irq = kvm_cpu_get_interrupt(vcpu); 4654 WARN_ON(irq < 0); 4655 vmcs12->vm_exit_intr_info = irq | 4656 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4657 } 4658 4659 if (vm_exit_reason != -1) 4660 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4661 vmcs12->exit_qualification, 4662 vmcs12->idt_vectoring_info_field, 4663 vmcs12->vm_exit_intr_info, 4664 vmcs12->vm_exit_intr_error_code, 4665 KVM_ISA_VMX); 4666 4667 load_vmcs12_host_state(vcpu, vmcs12); 4668 4669 return; 4670 } 4671 4672 /* 4673 * After an early L2 VM-entry failure, we're now back 4674 * in L1 which thinks it just finished a VMLAUNCH or 4675 * VMRESUME instruction, so we need to set the failure 4676 * flag and the VM-instruction error field of the VMCS 4677 * accordingly, and skip the emulated instruction. 4678 */ 4679 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4680 4681 /* 4682 * Restore L1's host state to KVM's software model. We're here 4683 * because a consistency check was caught by hardware, which 4684 * means some amount of guest state has been propagated to KVM's 4685 * model and needs to be unwound to the host's state. 4686 */ 4687 nested_vmx_restore_host_state(vcpu); 4688 4689 vmx->fail = 0; 4690 } 4691 4692 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4693 { 4694 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4695 } 4696 4697 /* 4698 * Decode the memory-address operand of a vmx instruction, as recorded on an 4699 * exit caused by such an instruction (run by a guest hypervisor). 4700 * On success, returns 0. When the operand is invalid, returns 1 and throws 4701 * #UD, #GP, or #SS. 4702 */ 4703 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4704 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4705 { 4706 gva_t off; 4707 bool exn; 4708 struct kvm_segment s; 4709 4710 /* 4711 * According to Vol. 
3B, "Information for VM Exits Due to Instruction 4712 * Execution", on an exit, vmx_instruction_info holds most of the 4713 * addressing components of the operand. Only the displacement part 4714 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4715 * For how an actual address is calculated from all these components, 4716 * refer to Vol. 1, "Operand Addressing". 4717 */ 4718 int scaling = vmx_instruction_info & 3; 4719 int addr_size = (vmx_instruction_info >> 7) & 7; 4720 bool is_reg = vmx_instruction_info & (1u << 10); 4721 int seg_reg = (vmx_instruction_info >> 15) & 7; 4722 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4723 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4724 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4725 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4726 4727 if (is_reg) { 4728 kvm_queue_exception(vcpu, UD_VECTOR); 4729 return 1; 4730 } 4731 4732 /* Addr = segment_base + offset */ 4733 /* offset = base + [index * scale] + displacement */ 4734 off = exit_qualification; /* holds the displacement */ 4735 if (addr_size == 1) 4736 off = (gva_t)sign_extend64(off, 31); 4737 else if (addr_size == 0) 4738 off = (gva_t)sign_extend64(off, 15); 4739 if (base_is_valid) 4740 off += kvm_register_read(vcpu, base_reg); 4741 if (index_is_valid) 4742 off += kvm_register_read(vcpu, index_reg) << scaling; 4743 vmx_get_segment(vcpu, &s, seg_reg); 4744 4745 /* 4746 * The effective address, i.e. @off, of a memory operand is truncated 4747 * based on the address size of the instruction. Note that this is 4748 * the *effective address*, i.e. the address prior to accounting for 4749 * the segment's base. 4750 */ 4751 if (addr_size == 1) /* 32 bit */ 4752 off &= 0xffffffff; 4753 else if (addr_size == 0) /* 16 bit */ 4754 off &= 0xffff; 4755 4756 /* Checks for #GP/#SS exceptions. 
*/ 4757 exn = false; 4758 if (is_long_mode(vcpu)) { 4759 /* 4760 * The virtual/linear address is never truncated in 64-bit 4761 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4762 * address when using FS/GS with a non-zero base. 4763 */ 4764 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4765 *ret = s.base + off; 4766 else 4767 *ret = off; 4768 4769 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4770 * non-canonical form. This is the only check on the memory 4771 * destination for long mode! 4772 */ 4773 exn = is_noncanonical_address(*ret, vcpu); 4774 } else { 4775 /* 4776 * When not in long mode, the virtual/linear address is 4777 * unconditionally truncated to 32 bits regardless of the 4778 * address size. 4779 */ 4780 *ret = (s.base + off) & 0xffffffff; 4781 4782 /* Protected mode: apply checks for segment validity in the 4783 * following order: 4784 * - segment type check (#GP(0) may be thrown) 4785 * - usability check (#GP(0)/#SS(0)) 4786 * - limit check (#GP(0)/#SS(0)) 4787 */ 4788 if (wr) 4789 /* #GP(0) if the destination operand is located in a 4790 * read-only data segment or any code segment. 4791 */ 4792 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4793 else 4794 /* #GP(0) if the source operand is located in an 4795 * execute-only code segment 4796 */ 4797 exn = ((s.type & 0xa) == 8); 4798 if (exn) { 4799 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4800 return 1; 4801 } 4802 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4803 */ 4804 exn = (s.unusable != 0); 4805 4806 /* 4807 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4808 * outside the segment limit. All CPUs that support VMX ignore 4809 * limit checks for flat segments, i.e. segments with base==0, 4810 * limit==0xffffffff and of type expand-up data or code. 
4811 */ 4812 if (!(s.base == 0 && s.limit == 0xffffffff && 4813 ((s.type & 8) || !(s.type & 4)))) 4814 exn = exn || ((u64)off + len - 1 > s.limit); 4815 } 4816 if (exn) { 4817 kvm_queue_exception_e(vcpu, 4818 seg_reg == VCPU_SREG_SS ? 4819 SS_VECTOR : GP_VECTOR, 4820 0); 4821 return 1; 4822 } 4823 4824 return 0; 4825 } 4826 4827 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 4828 int *ret) 4829 { 4830 gva_t gva; 4831 struct x86_exception e; 4832 int r; 4833 4834 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 4835 vmcs_read32(VMX_INSTRUCTION_INFO), false, 4836 sizeof(*vmpointer), &gva)) { 4837 *ret = 1; 4838 return -EINVAL; 4839 } 4840 4841 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 4842 if (r != X86EMUL_CONTINUE) { 4843 *ret = kvm_handle_memory_failure(vcpu, r, &e); 4844 return -EINVAL; 4845 } 4846 4847 return 0; 4848 } 4849 4850 /* 4851 * Allocate a shadow VMCS and associate it with the currently loaded 4852 * VMCS, unless such a shadow VMCS already exists. The newly allocated 4853 * VMCS is also VMCLEARed, so that it is ready for use. 4854 */ 4855 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 4856 { 4857 struct vcpu_vmx *vmx = to_vmx(vcpu); 4858 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 4859 4860 /* 4861 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 4862 * when L1 executes VMXOFF or the vCPU is forced out of nested 4863 * operation. VMXON faults if the CPU is already post-VMXON, so it 4864 * should be impossible to already have an allocated shadow VMCS. KVM 4865 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 4866 * always be the loaded VMCS. 
4867 */ 4868 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 4869 return loaded_vmcs->shadow_vmcs; 4870 4871 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 4872 if (loaded_vmcs->shadow_vmcs) 4873 vmcs_clear(loaded_vmcs->shadow_vmcs); 4874 4875 return loaded_vmcs->shadow_vmcs; 4876 } 4877 4878 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 4879 { 4880 struct vcpu_vmx *vmx = to_vmx(vcpu); 4881 int r; 4882 4883 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 4884 if (r < 0) 4885 goto out_vmcs02; 4886 4887 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4888 if (!vmx->nested.cached_vmcs12) 4889 goto out_cached_vmcs12; 4890 4891 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 4892 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4893 if (!vmx->nested.cached_shadow_vmcs12) 4894 goto out_cached_shadow_vmcs12; 4895 4896 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 4897 goto out_shadow_vmcs; 4898 4899 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 4900 HRTIMER_MODE_ABS_PINNED); 4901 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 4902 4903 vmx->nested.vpid02 = allocate_vpid(); 4904 4905 vmx->nested.vmcs02_initialized = false; 4906 vmx->nested.vmxon = true; 4907 4908 if (vmx_pt_mode_is_host_guest()) { 4909 vmx->pt_desc.guest.ctl = 0; 4910 pt_update_intercept_for_msr(vcpu); 4911 } 4912 4913 return 0; 4914 4915 out_shadow_vmcs: 4916 kfree(vmx->nested.cached_shadow_vmcs12); 4917 4918 out_cached_shadow_vmcs12: 4919 kfree(vmx->nested.cached_vmcs12); 4920 4921 out_cached_vmcs12: 4922 free_loaded_vmcs(&vmx->nested.vmcs02); 4923 4924 out_vmcs02: 4925 return -ENOMEM; 4926 } 4927 4928 /* Emulate the VMXON instruction. 
*/ 4929 static int handle_vmxon(struct kvm_vcpu *vcpu) 4930 { 4931 int ret; 4932 gpa_t vmptr; 4933 uint32_t revision; 4934 struct vcpu_vmx *vmx = to_vmx(vcpu); 4935 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 4936 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 4937 4938 /* 4939 * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks 4940 * that have higher priority than VM-Exit (see Intel SDM's pseudocode 4941 * for VMXON), as KVM must load valid CR0/CR4 values into hardware while 4942 * running the guest, i.e. KVM needs to check the _guest_ values. 4943 * 4944 * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and 4945 * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real 4946 * Mode, but KVM will never take the guest out of those modes. 4947 */ 4948 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 4949 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 4950 kvm_queue_exception(vcpu, UD_VECTOR); 4951 return 1; 4952 } 4953 4954 /* 4955 * CPL=0 and all other checks that are lower priority than VM-Exit must 4956 * be checked manually. 
4957 */ 4958 if (vmx_get_cpl(vcpu)) { 4959 kvm_inject_gp(vcpu, 0); 4960 return 1; 4961 } 4962 4963 if (vmx->nested.vmxon) 4964 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 4965 4966 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 4967 != VMXON_NEEDED_FEATURES) { 4968 kvm_inject_gp(vcpu, 0); 4969 return 1; 4970 } 4971 4972 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 4973 return ret; 4974 4975 /* 4976 * SDM 3: 24.11.5 4977 * The first 4 bytes of VMXON region contain the supported 4978 * VMCS revision identifier 4979 * 4980 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 4981 * which replaces physical address width with 32 4982 */ 4983 if (!page_address_valid(vcpu, vmptr)) 4984 return nested_vmx_failInvalid(vcpu); 4985 4986 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 4987 revision != VMCS12_REVISION) 4988 return nested_vmx_failInvalid(vcpu); 4989 4990 vmx->nested.vmxon_ptr = vmptr; 4991 ret = enter_vmx_operation(vcpu); 4992 if (ret) 4993 return ret; 4994 4995 return nested_vmx_succeed(vcpu); 4996 } 4997 4998 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 4999 { 5000 struct vcpu_vmx *vmx = to_vmx(vcpu); 5001 5002 if (vmx->nested.current_vmptr == INVALID_GPA) 5003 return; 5004 5005 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5006 5007 if (enable_shadow_vmcs) { 5008 /* copy to memory all shadowed fields in case 5009 they were modified */ 5010 copy_shadow_to_vmcs12(vmx); 5011 vmx_disable_shadow_vmcs(vmx); 5012 } 5013 vmx->nested.posted_intr_nv = -1; 5014 5015 /* Flush VMCS12 to guest memory */ 5016 kvm_vcpu_write_guest_page(vcpu, 5017 vmx->nested.current_vmptr >> PAGE_SHIFT, 5018 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5019 5020 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5021 5022 vmx->nested.current_vmptr = INVALID_GPA; 5023 } 5024 5025 /* Emulate the VMXOFF instruction */ 5026 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5027 { 5028 
if (!nested_vmx_check_permission(vcpu)) 5029 return 1; 5030 5031 free_nested(vcpu); 5032 5033 /* Process a latched INIT during time CPU was in VMX operation */ 5034 kvm_make_request(KVM_REQ_EVENT, vcpu); 5035 5036 return nested_vmx_succeed(vcpu); 5037 } 5038 5039 /* Emulate the VMCLEAR instruction */ 5040 static int handle_vmclear(struct kvm_vcpu *vcpu) 5041 { 5042 struct vcpu_vmx *vmx = to_vmx(vcpu); 5043 u32 zero = 0; 5044 gpa_t vmptr; 5045 u64 evmcs_gpa; 5046 int r; 5047 5048 if (!nested_vmx_check_permission(vcpu)) 5049 return 1; 5050 5051 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5052 return r; 5053 5054 if (!page_address_valid(vcpu, vmptr)) 5055 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5056 5057 if (vmptr == vmx->nested.vmxon_ptr) 5058 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5059 5060 /* 5061 * When Enlightened VMEntry is enabled on the calling CPU we treat 5062 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 5063 * way to distinguish it from VMCS12) and we must not corrupt it by 5064 * writing to the non-existent 'launch_state' field. The area doesn't 5065 * have to be the currently active EVMCS on the calling CPU and there's 5066 * nothing KVM has to do to transition it from 'active' to 'non-active' 5067 * state. It is possible that the area will stay mapped as 5068 * vmx->nested.hv_evmcs but this shouldn't be a problem. 
5069 */ 5070 if (likely(!vmx->nested.enlightened_vmcs_enabled || 5071 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 5072 if (vmptr == vmx->nested.current_vmptr) 5073 nested_release_vmcs12(vcpu); 5074 5075 kvm_vcpu_write_guest(vcpu, 5076 vmptr + offsetof(struct vmcs12, 5077 launch_state), 5078 &zero, sizeof(zero)); 5079 } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { 5080 nested_release_evmcs(vcpu); 5081 } 5082 5083 return nested_vmx_succeed(vcpu); 5084 } 5085 5086 /* Emulate the VMLAUNCH instruction */ 5087 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5088 { 5089 return nested_vmx_run(vcpu, true); 5090 } 5091 5092 /* Emulate the VMRESUME instruction */ 5093 static int handle_vmresume(struct kvm_vcpu *vcpu) 5094 { 5095 5096 return nested_vmx_run(vcpu, false); 5097 } 5098 5099 static int handle_vmread(struct kvm_vcpu *vcpu) 5100 { 5101 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5102 : get_vmcs12(vcpu); 5103 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5104 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5105 struct vcpu_vmx *vmx = to_vmx(vcpu); 5106 struct x86_exception e; 5107 unsigned long field; 5108 u64 value; 5109 gva_t gva = 0; 5110 short offset; 5111 int len, r; 5112 5113 if (!nested_vmx_check_permission(vcpu)) 5114 return 1; 5115 5116 /* Decode instruction info and find the field to read */ 5117 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5118 5119 if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 5120 /* 5121 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5122 * any VMREAD sets the ALU flags for VMfailInvalid. 
5123 */ 5124 if (vmx->nested.current_vmptr == INVALID_GPA || 5125 (is_guest_mode(vcpu) && 5126 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5127 return nested_vmx_failInvalid(vcpu); 5128 5129 offset = get_vmcs12_field_offset(field); 5130 if (offset < 0) 5131 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5132 5133 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5134 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5135 5136 /* Read the field, zero-extended to a u64 value */ 5137 value = vmcs12_read_any(vmcs12, field, offset); 5138 } else { 5139 /* 5140 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5141 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5142 * unsupported. Unfortunately, certain versions of Windows 11 5143 * don't comply with this requirement which is not enforced in 5144 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5145 * workaround, as misbehaving guests will panic on VM-Fail. 5146 * Note, enlightened VMCS is incompatible with shadow VMCS so 5147 * all VMREADs from L2 should go to L1. 5148 */ 5149 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5150 return nested_vmx_failInvalid(vcpu); 5151 5152 offset = evmcs_field_offset(field, NULL); 5153 if (offset < 0) 5154 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5155 5156 /* Read the field, zero-extended to a u64 value */ 5157 value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset); 5158 } 5159 5160 /* 5161 * Now copy part of this value to register or memory, as requested. 5162 * Note that the number of bits actually copied is 32 or 64 depending 5163 * on the guest's mode (32 or 64 bit), not on the given field's length. 5164 */ 5165 if (instr_info & BIT(10)) { 5166 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5167 } else { 5168 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5169 if (get_vmx_mem_address(vcpu, exit_qualification, 5170 instr_info, true, len, &gva)) 5171 return 1; 5172 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5173 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5174 if (r != X86EMUL_CONTINUE) 5175 return kvm_handle_memory_failure(vcpu, r, &e); 5176 } 5177 5178 return nested_vmx_succeed(vcpu); 5179 } 5180 5181 static bool is_shadow_field_rw(unsigned long field) 5182 { 5183 switch (field) { 5184 #define SHADOW_FIELD_RW(x, y) case x: 5185 #include "vmcs_shadow_fields.h" 5186 return true; 5187 default: 5188 break; 5189 } 5190 return false; 5191 } 5192 5193 static bool is_shadow_field_ro(unsigned long field) 5194 { 5195 switch (field) { 5196 #define SHADOW_FIELD_RO(x, y) case x: 5197 #include "vmcs_shadow_fields.h" 5198 return true; 5199 default: 5200 break; 5201 } 5202 return false; 5203 } 5204 5205 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5206 { 5207 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5208 : get_vmcs12(vcpu); 5209 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5210 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5211 struct vcpu_vmx *vmx = to_vmx(vcpu); 5212 struct x86_exception e; 5213 unsigned long field; 5214 short offset; 5215 gva_t gva; 5216 int len, r; 5217 5218 /* 5219 * The value to write might be 32 or 64 bits, depending on L1's long 5220 * mode, and eventually we need to write that into a field of several 5221 * possible lengths. The code below first zero-extends the value to 64 5222 * bit (value), and then copies only the appropriate number of 5223 * bits into the vmcs12 field. 5224 */ 5225 u64 value = 0; 5226 5227 if (!nested_vmx_check_permission(vcpu)) 5228 return 1; 5229 5230 /* 5231 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5232 * any VMWRITE sets the ALU flags for VMfailInvalid. 
5233 */ 5234 if (vmx->nested.current_vmptr == INVALID_GPA || 5235 (is_guest_mode(vcpu) && 5236 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5237 return nested_vmx_failInvalid(vcpu); 5238 5239 if (instr_info & BIT(10)) 5240 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5241 else { 5242 len = is_64_bit_mode(vcpu) ? 8 : 4; 5243 if (get_vmx_mem_address(vcpu, exit_qualification, 5244 instr_info, false, len, &gva)) 5245 return 1; 5246 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5247 if (r != X86EMUL_CONTINUE) 5248 return kvm_handle_memory_failure(vcpu, r, &e); 5249 } 5250 5251 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5252 5253 offset = get_vmcs12_field_offset(field); 5254 if (offset < 0) 5255 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5256 5257 /* 5258 * If the vCPU supports "VMWRITE to any supported field in the 5259 * VMCS," then the "read-only" fields are actually read/write. 5260 */ 5261 if (vmcs_field_readonly(field) && 5262 !nested_cpu_has_vmwrite_any_field(vcpu)) 5263 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5264 5265 /* 5266 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5267 * vmcs12, else we may crush a field or consume a stale value. 5268 */ 5269 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5270 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5271 5272 /* 5273 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5274 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5275 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5276 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5277 * from L1 will return a different value than VMREAD from L2 (L1 sees 5278 * the stripped down value, L2 sees the full value as stored by KVM). 
5279 */ 5280 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5281 value &= 0x1f0ff; 5282 5283 vmcs12_write_any(vmcs12, field, offset, value); 5284 5285 /* 5286 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5287 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5288 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5289 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5290 */ 5291 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5292 /* 5293 * L1 can read these fields without exiting, ensure the 5294 * shadow VMCS is up-to-date. 5295 */ 5296 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5297 preempt_disable(); 5298 vmcs_load(vmx->vmcs01.shadow_vmcs); 5299 5300 __vmcs_writel(field, value); 5301 5302 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5303 vmcs_load(vmx->loaded_vmcs->vmcs); 5304 preempt_enable(); 5305 } 5306 vmx->nested.dirty_vmcs12 = true; 5307 } 5308 5309 return nested_vmx_succeed(vcpu); 5310 } 5311 5312 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5313 { 5314 vmx->nested.current_vmptr = vmptr; 5315 if (enable_shadow_vmcs) { 5316 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5317 vmcs_write64(VMCS_LINK_POINTER, 5318 __pa(vmx->vmcs01.shadow_vmcs)); 5319 vmx->nested.need_vmcs12_to_shadow_sync = true; 5320 } 5321 vmx->nested.dirty_vmcs12 = true; 5322 vmx->nested.force_msr_bitmap_recalc = true; 5323 } 5324 5325 /* Emulate the VMPTRLD instruction */ 5326 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5327 { 5328 struct vcpu_vmx *vmx = to_vmx(vcpu); 5329 gpa_t vmptr; 5330 int r; 5331 5332 if (!nested_vmx_check_permission(vcpu)) 5333 return 1; 5334 5335 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5336 return r; 5337 5338 if (!page_address_valid(vcpu, vmptr)) 5339 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5340 5341 if (vmptr == vmx->nested.vmxon_ptr) 5342 return nested_vmx_fail(vcpu, 
VMXERR_VMPTRLD_VMXON_POINTER); 5343 5344 /* Forbid normal VMPTRLD if Enlightened version was used */ 5345 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 5346 return 1; 5347 5348 if (vmx->nested.current_vmptr != vmptr) { 5349 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5350 struct vmcs_hdr hdr; 5351 5352 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5353 /* 5354 * Reads from an unbacked page return all 1s, 5355 * which means that the 32 bits located at the 5356 * given physical address won't match the required 5357 * VMCS12_REVISION identifier. 5358 */ 5359 return nested_vmx_fail(vcpu, 5360 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5361 } 5362 5363 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5364 offsetof(struct vmcs12, hdr), 5365 sizeof(hdr))) { 5366 return nested_vmx_fail(vcpu, 5367 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5368 } 5369 5370 if (hdr.revision_id != VMCS12_REVISION || 5371 (hdr.shadow_vmcs && 5372 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5373 return nested_vmx_fail(vcpu, 5374 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5375 } 5376 5377 nested_release_vmcs12(vcpu); 5378 5379 /* 5380 * Load VMCS12 from guest memory since it is not already 5381 * cached. 
5382 */ 5383 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5384 VMCS12_SIZE)) { 5385 return nested_vmx_fail(vcpu, 5386 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5387 } 5388 5389 set_current_vmptr(vmx, vmptr); 5390 } 5391 5392 return nested_vmx_succeed(vcpu); 5393 } 5394 5395 /* Emulate the VMPTRST instruction */ 5396 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5397 { 5398 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5399 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5400 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5401 struct x86_exception e; 5402 gva_t gva; 5403 int r; 5404 5405 if (!nested_vmx_check_permission(vcpu)) 5406 return 1; 5407 5408 if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) 5409 return 1; 5410 5411 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5412 true, sizeof(gpa_t), &gva)) 5413 return 1; 5414 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5415 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5416 sizeof(gpa_t), &e); 5417 if (r != X86EMUL_CONTINUE) 5418 return kvm_handle_memory_failure(vcpu, r, &e); 5419 5420 return nested_vmx_succeed(vcpu); 5421 } 5422 5423 /* Emulate the INVEPT instruction */ 5424 static int handle_invept(struct kvm_vcpu *vcpu) 5425 { 5426 struct vcpu_vmx *vmx = to_vmx(vcpu); 5427 u32 vmx_instruction_info, types; 5428 unsigned long type, roots_to_free; 5429 struct kvm_mmu *mmu; 5430 gva_t gva; 5431 struct x86_exception e; 5432 struct { 5433 u64 eptp, gpa; 5434 } operand; 5435 int i, r, gpr_index; 5436 5437 if (!(vmx->nested.msrs.secondary_ctls_high & 5438 SECONDARY_EXEC_ENABLE_EPT) || 5439 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5440 kvm_queue_exception(vcpu, UD_VECTOR); 5441 return 1; 5442 } 5443 5444 if (!nested_vmx_check_permission(vcpu)) 5445 return 1; 5446 5447 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5448 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5449 type = 
kvm_register_read(vcpu, gpr_index); 5450 5451 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5452 5453 if (type >= 32 || !(types & (1 << type))) 5454 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5455 5456 /* According to the Intel VMX instruction reference, the memory 5457 * operand is read even if it isn't needed (e.g., for type==global) 5458 */ 5459 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5460 vmx_instruction_info, false, sizeof(operand), &gva)) 5461 return 1; 5462 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5463 if (r != X86EMUL_CONTINUE) 5464 return kvm_handle_memory_failure(vcpu, r, &e); 5465 5466 /* 5467 * Nested EPT roots are always held through guest_mmu, 5468 * not root_mmu. 5469 */ 5470 mmu = &vcpu->arch.guest_mmu; 5471 5472 switch (type) { 5473 case VMX_EPT_EXTENT_CONTEXT: 5474 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5475 return nested_vmx_fail(vcpu, 5476 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5477 5478 roots_to_free = 0; 5479 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 5480 operand.eptp)) 5481 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5482 5483 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5484 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5485 mmu->prev_roots[i].pgd, 5486 operand.eptp)) 5487 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5488 } 5489 break; 5490 case VMX_EPT_EXTENT_GLOBAL: 5491 roots_to_free = KVM_MMU_ROOTS_ALL; 5492 break; 5493 default: 5494 BUG(); 5495 break; 5496 } 5497 5498 if (roots_to_free) 5499 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 5500 5501 return nested_vmx_succeed(vcpu); 5502 } 5503 5504 static int handle_invvpid(struct kvm_vcpu *vcpu) 5505 { 5506 struct vcpu_vmx *vmx = to_vmx(vcpu); 5507 u32 vmx_instruction_info; 5508 unsigned long type, types; 5509 gva_t gva; 5510 struct x86_exception e; 5511 struct { 5512 u64 vpid; 5513 u64 gla; 5514 } operand; 5515 u16 vpid02; 5516 int r, gpr_index; 5517 5518 if 
(!(vmx->nested.msrs.secondary_ctls_high & 5519 SECONDARY_EXEC_ENABLE_VPID) || 5520 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5521 kvm_queue_exception(vcpu, UD_VECTOR); 5522 return 1; 5523 } 5524 5525 if (!nested_vmx_check_permission(vcpu)) 5526 return 1; 5527 5528 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5529 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5530 type = kvm_register_read(vcpu, gpr_index); 5531 5532 types = (vmx->nested.msrs.vpid_caps & 5533 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5534 5535 if (type >= 32 || !(types & (1 << type))) 5536 return nested_vmx_fail(vcpu, 5537 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5538 5539 /* according to the intel vmx instruction reference, the memory 5540 * operand is read even if it isn't needed (e.g., for type==global) 5541 */ 5542 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5543 vmx_instruction_info, false, sizeof(operand), &gva)) 5544 return 1; 5545 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5546 if (r != X86EMUL_CONTINUE) 5547 return kvm_handle_memory_failure(vcpu, r, &e); 5548 5549 if (operand.vpid >> 16) 5550 return nested_vmx_fail(vcpu, 5551 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5552 5553 vpid02 = nested_get_vpid02(vcpu); 5554 switch (type) { 5555 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5556 if (!operand.vpid || 5557 is_noncanonical_address(operand.gla, vcpu)) 5558 return nested_vmx_fail(vcpu, 5559 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5560 vpid_sync_vcpu_addr(vpid02, operand.gla); 5561 break; 5562 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5563 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5564 if (!operand.vpid) 5565 return nested_vmx_fail(vcpu, 5566 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5567 vpid_sync_context(vpid02); 5568 break; 5569 case VMX_VPID_EXTENT_ALL_CONTEXT: 5570 vpid_sync_context(vpid02); 5571 break; 5572 default: 5573 WARN_ON_ONCE(1); 5574 return kvm_skip_emulated_instruction(vcpu); 5575 } 5576 5577 /* 5578 * Sync the 
shadow page tables if EPT is disabled, L1 is invalidating 5579 * linear mappings for L2 (tagged with L2's VPID). Free all guest 5580 * roots as VPIDs are not tracked in the MMU role. 5581 * 5582 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5583 * an MMU when EPT is disabled. 5584 * 5585 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 5586 */ 5587 if (!enable_ept) 5588 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 5589 5590 return nested_vmx_succeed(vcpu); 5591 } 5592 5593 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5594 struct vmcs12 *vmcs12) 5595 { 5596 u32 index = kvm_rcx_read(vcpu); 5597 u64 new_eptp; 5598 5599 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 5600 return 1; 5601 if (index >= VMFUNC_EPTP_ENTRIES) 5602 return 1; 5603 5604 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5605 &new_eptp, index * 8, 8)) 5606 return 1; 5607 5608 /* 5609 * If the (L2) guest does a vmfunc to the currently 5610 * active ept pointer, we don't have to do anything else 5611 */ 5612 if (vmcs12->ept_pointer != new_eptp) { 5613 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5614 return 1; 5615 5616 vmcs12->ept_pointer = new_eptp; 5617 nested_ept_new_eptp(vcpu); 5618 5619 if (!nested_cpu_has_vpid(vmcs12)) 5620 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 5621 } 5622 5623 return 0; 5624 } 5625 5626 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5627 { 5628 struct vcpu_vmx *vmx = to_vmx(vcpu); 5629 struct vmcs12 *vmcs12; 5630 u32 function = kvm_rax_read(vcpu); 5631 5632 /* 5633 * VMFUNC is only supported for nested guests, but we always enable the 5634 * secondary control for simplicity; for non-nested mode, fake that we 5635 * didn't by injecting #UD. 
5636 */ 5637 if (!is_guest_mode(vcpu)) { 5638 kvm_queue_exception(vcpu, UD_VECTOR); 5639 return 1; 5640 } 5641 5642 vmcs12 = get_vmcs12(vcpu); 5643 5644 /* 5645 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 5646 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 5647 */ 5648 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 5649 kvm_queue_exception(vcpu, UD_VECTOR); 5650 return 1; 5651 } 5652 5653 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 5654 goto fail; 5655 5656 switch (function) { 5657 case 0: 5658 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5659 goto fail; 5660 break; 5661 default: 5662 goto fail; 5663 } 5664 return kvm_skip_emulated_instruction(vcpu); 5665 5666 fail: 5667 /* 5668 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5669 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5670 * EXIT_REASON_VMFUNC as the exit reason. 5671 */ 5672 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5673 vmx_get_intr_info(vcpu), 5674 vmx_get_exit_qual(vcpu)); 5675 return 1; 5676 } 5677 5678 /* 5679 * Return true if an IO instruction with the specified port and size should cause 5680 * a VM-exit into L1. 
5681 */ 5682 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5683 int size) 5684 { 5685 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5686 gpa_t bitmap, last_bitmap; 5687 u8 b; 5688 5689 last_bitmap = INVALID_GPA; 5690 b = -1; 5691 5692 while (size > 0) { 5693 if (port < 0x8000) 5694 bitmap = vmcs12->io_bitmap_a; 5695 else if (port < 0x10000) 5696 bitmap = vmcs12->io_bitmap_b; 5697 else 5698 return true; 5699 bitmap += (port & 0x7fff) / 8; 5700 5701 if (last_bitmap != bitmap) 5702 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5703 return true; 5704 if (b & (1 << (port & 7))) 5705 return true; 5706 5707 port++; 5708 size--; 5709 last_bitmap = bitmap; 5710 } 5711 5712 return false; 5713 } 5714 5715 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5716 struct vmcs12 *vmcs12) 5717 { 5718 unsigned long exit_qualification; 5719 unsigned short port; 5720 int size; 5721 5722 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5723 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 5724 5725 exit_qualification = vmx_get_exit_qual(vcpu); 5726 5727 port = exit_qualification >> 16; 5728 size = (exit_qualification & 7) + 1; 5729 5730 return nested_vmx_check_io_bitmaps(vcpu, port, size); 5731 } 5732 5733 /* 5734 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 5735 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 5736 * disinterest in the current event (read or write a specific MSR) by using an 5737 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 
5738 */ 5739 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 5740 struct vmcs12 *vmcs12, 5741 union vmx_exit_reason exit_reason) 5742 { 5743 u32 msr_index = kvm_rcx_read(vcpu); 5744 gpa_t bitmap; 5745 5746 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 5747 return true; 5748 5749 /* 5750 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 5751 * for the four combinations of read/write and low/high MSR numbers. 5752 * First we need to figure out which of the four to use: 5753 */ 5754 bitmap = vmcs12->msr_bitmap; 5755 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 5756 bitmap += 2048; 5757 if (msr_index >= 0xc0000000) { 5758 msr_index -= 0xc0000000; 5759 bitmap += 1024; 5760 } 5761 5762 /* Then read the msr_index'th bit from this bitmap: */ 5763 if (msr_index < 1024*8) { 5764 unsigned char b; 5765 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 5766 return true; 5767 return 1 & (b >> (msr_index & 7)); 5768 } else 5769 return true; /* let L1 handle the wrong parameter */ 5770 } 5771 5772 /* 5773 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 5774 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 5775 * intercept (via guest_host_mask etc.) the current event. 
5776 */ 5777 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 5778 struct vmcs12 *vmcs12) 5779 { 5780 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5781 int cr = exit_qualification & 15; 5782 int reg; 5783 unsigned long val; 5784 5785 switch ((exit_qualification >> 4) & 3) { 5786 case 0: /* mov to cr */ 5787 reg = (exit_qualification >> 8) & 15; 5788 val = kvm_register_read(vcpu, reg); 5789 switch (cr) { 5790 case 0: 5791 if (vmcs12->cr0_guest_host_mask & 5792 (val ^ vmcs12->cr0_read_shadow)) 5793 return true; 5794 break; 5795 case 3: 5796 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 5797 return true; 5798 break; 5799 case 4: 5800 if (vmcs12->cr4_guest_host_mask & 5801 (vmcs12->cr4_read_shadow ^ val)) 5802 return true; 5803 break; 5804 case 8: 5805 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 5806 return true; 5807 break; 5808 } 5809 break; 5810 case 2: /* clts */ 5811 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 5812 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 5813 return true; 5814 break; 5815 case 1: /* mov from cr */ 5816 switch (cr) { 5817 case 3: 5818 if (vmcs12->cpu_based_vm_exec_control & 5819 CPU_BASED_CR3_STORE_EXITING) 5820 return true; 5821 break; 5822 case 8: 5823 if (vmcs12->cpu_based_vm_exec_control & 5824 CPU_BASED_CR8_STORE_EXITING) 5825 return true; 5826 break; 5827 } 5828 break; 5829 case 3: /* lmsw */ 5830 /* 5831 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 5832 * cr0. Other attempted changes are ignored, with no exit. 
5833 */ 5834 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5835 if (vmcs12->cr0_guest_host_mask & 0xe & 5836 (val ^ vmcs12->cr0_read_shadow)) 5837 return true; 5838 if ((vmcs12->cr0_guest_host_mask & 0x1) && 5839 !(vmcs12->cr0_read_shadow & 0x1) && 5840 (val & 0x1)) 5841 return true; 5842 break; 5843 } 5844 return false; 5845 } 5846 5847 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 5848 struct vmcs12 *vmcs12) 5849 { 5850 u32 encls_leaf; 5851 5852 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 5853 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 5854 return false; 5855 5856 encls_leaf = kvm_rax_read(vcpu); 5857 if (encls_leaf > 62) 5858 encls_leaf = 63; 5859 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 5860 } 5861 5862 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 5863 struct vmcs12 *vmcs12, gpa_t bitmap) 5864 { 5865 u32 vmx_instruction_info; 5866 unsigned long field; 5867 u8 b; 5868 5869 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 5870 return true; 5871 5872 /* Decode instruction info and find the field to access */ 5873 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5874 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 5875 5876 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 5877 if (field >> 15) 5878 return true; 5879 5880 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 5881 return true; 5882 5883 return 1 & (b >> (field & 7)); 5884 } 5885 5886 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 5887 { 5888 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 5889 5890 if (nested_cpu_has_mtf(vmcs12)) 5891 return true; 5892 5893 /* 5894 * An MTF VM-exit may be injected into the guest by setting the 5895 * interruption-type to 7 (other event) and the vector field to 0. Such 5896 * is the case regardless of the 'monitor trap flag' VM-execution 5897 * control. 
5898 */ 5899 return entry_intr_info == (INTR_INFO_VALID_MASK 5900 | INTR_TYPE_OTHER_EVENT); 5901 } 5902 5903 /* 5904 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 5905 * L1 wants the exit. Only call this when in is_guest_mode (L2). 5906 */ 5907 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 5908 union vmx_exit_reason exit_reason) 5909 { 5910 u32 intr_info; 5911 5912 switch ((u16)exit_reason.basic) { 5913 case EXIT_REASON_EXCEPTION_NMI: 5914 intr_info = vmx_get_intr_info(vcpu); 5915 if (is_nmi(intr_info)) 5916 return true; 5917 else if (is_page_fault(intr_info)) 5918 return vcpu->arch.apf.host_apf_flags || 5919 vmx_need_pf_intercept(vcpu); 5920 else if (is_debug(intr_info) && 5921 vcpu->guest_debug & 5922 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5923 return true; 5924 else if (is_breakpoint(intr_info) && 5925 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5926 return true; 5927 else if (is_alignment_check(intr_info) && 5928 !vmx_guest_inject_ac(vcpu)) 5929 return true; 5930 return false; 5931 case EXIT_REASON_EXTERNAL_INTERRUPT: 5932 return true; 5933 case EXIT_REASON_MCE_DURING_VMENTRY: 5934 return true; 5935 case EXIT_REASON_EPT_VIOLATION: 5936 /* 5937 * L0 always deals with the EPT violation. If nested EPT is 5938 * used, and the nested mmu code discovers that the address is 5939 * missing in the guest EPT table (EPT12), the EPT violation 5940 * will be injected with nested_ept_inject_page_fault() 5941 */ 5942 return true; 5943 case EXIT_REASON_EPT_MISCONFIG: 5944 /* 5945 * L2 never uses directly L1's EPT, but rather L0's own EPT 5946 * table (shadow on EPT) or a merged EPT table that L0 built 5947 * (EPT on EPT). So any problems with the structure of the 5948 * table is L0's fault. 
5949 */ 5950 return true; 5951 case EXIT_REASON_PREEMPTION_TIMER: 5952 return true; 5953 case EXIT_REASON_PML_FULL: 5954 /* 5955 * PML is emulated for an L1 VMM and should never be enabled in 5956 * vmcs02, always "handle" PML_FULL by exiting to userspace. 5957 */ 5958 return true; 5959 case EXIT_REASON_VMFUNC: 5960 /* VM functions are emulated through L2->L0 vmexits. */ 5961 return true; 5962 case EXIT_REASON_BUS_LOCK: 5963 /* 5964 * At present, bus lock VM exit is never exposed to L1. 5965 * Handle L2's bus locks in L0 directly. 5966 */ 5967 return true; 5968 default: 5969 break; 5970 } 5971 return false; 5972 } 5973 5974 /* 5975 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 5976 * is_guest_mode (L2). 5977 */ 5978 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 5979 union vmx_exit_reason exit_reason) 5980 { 5981 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5982 u32 intr_info; 5983 5984 switch ((u16)exit_reason.basic) { 5985 case EXIT_REASON_EXCEPTION_NMI: 5986 intr_info = vmx_get_intr_info(vcpu); 5987 if (is_nmi(intr_info)) 5988 return true; 5989 else if (is_page_fault(intr_info)) 5990 return true; 5991 return vmcs12->exception_bitmap & 5992 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 5993 case EXIT_REASON_EXTERNAL_INTERRUPT: 5994 return nested_exit_on_intr(vcpu); 5995 case EXIT_REASON_TRIPLE_FAULT: 5996 return true; 5997 case EXIT_REASON_INTERRUPT_WINDOW: 5998 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 5999 case EXIT_REASON_NMI_WINDOW: 6000 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6001 case EXIT_REASON_TASK_SWITCH: 6002 return true; 6003 case EXIT_REASON_CPUID: 6004 return true; 6005 case EXIT_REASON_HLT: 6006 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6007 case EXIT_REASON_INVD: 6008 return true; 6009 case EXIT_REASON_INVLPG: 6010 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6011 case EXIT_REASON_RDPMC: 6012 return nested_cpu_has(vmcs12, 
CPU_BASED_RDPMC_EXITING); 6013 case EXIT_REASON_RDRAND: 6014 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6015 case EXIT_REASON_RDSEED: 6016 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6017 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6018 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6019 case EXIT_REASON_VMREAD: 6020 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6021 vmcs12->vmread_bitmap); 6022 case EXIT_REASON_VMWRITE: 6023 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6024 vmcs12->vmwrite_bitmap); 6025 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6026 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6027 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6028 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6029 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6030 /* 6031 * VMX instructions trap unconditionally. This allows L1 to 6032 * emulate them for its L2 guest, i.e., allows 3-level nesting! 6033 */ 6034 return true; 6035 case EXIT_REASON_CR_ACCESS: 6036 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6037 case EXIT_REASON_DR_ACCESS: 6038 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6039 case EXIT_REASON_IO_INSTRUCTION: 6040 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6041 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6042 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6043 case EXIT_REASON_MSR_READ: 6044 case EXIT_REASON_MSR_WRITE: 6045 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6046 case EXIT_REASON_INVALID_STATE: 6047 return true; 6048 case EXIT_REASON_MWAIT_INSTRUCTION: 6049 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6050 case EXIT_REASON_MONITOR_TRAP_FLAG: 6051 return nested_vmx_exit_handled_mtf(vmcs12); 6052 case EXIT_REASON_MONITOR_INSTRUCTION: 6053 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6054 case EXIT_REASON_PAUSE_INSTRUCTION: 6055 return nested_cpu_has(vmcs12, 
CPU_BASED_PAUSE_EXITING) || 6056 nested_cpu_has2(vmcs12, 6057 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6058 case EXIT_REASON_MCE_DURING_VMENTRY: 6059 return true; 6060 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6061 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6062 case EXIT_REASON_APIC_ACCESS: 6063 case EXIT_REASON_APIC_WRITE: 6064 case EXIT_REASON_EOI_INDUCED: 6065 /* 6066 * The controls for "virtualize APIC accesses," "APIC- 6067 * register virtualization," and "virtual-interrupt 6068 * delivery" only come from vmcs12. 6069 */ 6070 return true; 6071 case EXIT_REASON_INVPCID: 6072 return 6073 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6074 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6075 case EXIT_REASON_WBINVD: 6076 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6077 case EXIT_REASON_XSETBV: 6078 return true; 6079 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6080 /* 6081 * This should never happen, since it is not possible to 6082 * set XSS to a non-zero value---neither in L1 nor in L2. 6083 * If if it were, XSS would have to be checked against 6084 * the XSS exit bitmap in vmcs12. 6085 */ 6086 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 6087 case EXIT_REASON_UMWAIT: 6088 case EXIT_REASON_TPAUSE: 6089 return nested_cpu_has2(vmcs12, 6090 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6091 case EXIT_REASON_ENCLS: 6092 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6093 case EXIT_REASON_NOTIFY: 6094 /* Notify VM exit is not exposed to L1 */ 6095 return false; 6096 default: 6097 return true; 6098 } 6099 } 6100 6101 /* 6102 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6103 * reflected into L1. 
*/
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx->exit_reason;
	unsigned long exit_qual;
	u32 exit_intr_info;

	WARN_ON_ONCE(vmx->nested.nested_run_pending);

	/*
	 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
	 * has already loaded L2's state.
	 */
	if (unlikely(vmx->fail)) {
		trace_kvm_nested_vmenter_failed(
			"hardware VM-instruction error: ",
			vmcs_read32(VM_INSTRUCTION_ERROR));
		exit_intr_info = 0;
		exit_qual = 0;
		goto reflect_vmexit;
	}

	trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);

	/* If L0 (KVM) wants the exit, it trumps L1's desires. */
	if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
		return false;

	/* If L1 doesn't want the exit, handle it in L0. */
	if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
		return false;

	/*
	 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits.  For
	 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
	 * need to be synthesized by querying the in-kernel LAPIC, but external
	 * interrupts are never reflected to L1 so it's a non-issue.
	 */
	exit_intr_info = vmx_get_intr_info(vcpu);
	if (is_exception_with_error_code(exit_intr_info)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

		/* Stash the hardware error code in vmcs12 before the exit. */
		vmcs12->vm_exit_intr_error_code =
			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
	}
	exit_qual = vmx_get_exit_qual(vcpu);

reflect_vmexit:
	nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
	return true;
}

/*
 * Export the vCPU's nested VMX state to userspace.
 *
 * With a NULL @vcpu, only the maximum size (header plus the VMX data area) is
 * returned.  Otherwise the header is built, and the size the state requires is
 * returned; if @user_data_size is smaller than that, nothing is copied.  When
 * a valid vmcs12 exists it is brought up to date and copied out after the
 * header, followed by the shadow vmcs12 when vmcs12 uses VMCS shadowing with
 * a valid link pointer.
 *
 * Returns the required size, or -EFAULT if a copy to userspace fails.
 */
static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_VMX,
		.size = sizeof(kvm_state),
		.hdr.vmx.flags = 0,
		.hdr.vmx.vmxon_pa = INVALID_GPA,
		.hdr.vmx.vmcs12_pa = INVALID_GPA,
		.hdr.vmx.preemption_timer_deadline = 0,
	};
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];

	/* Size query: no vCPU, report the largest possible state. */
	if (!vcpu)
		return kvm_state.size + sizeof(*user_vmx_nested_state);

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (nested_vmx_allowed(vcpu) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;

		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);

			/* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
			if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != INVALID_GPA)
				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;

			if (vmx->nested.mtf_pending)
				kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;

			if (nested_cpu_has_preemption_timer(vmcs12) &&
			    vmx->nested.has_preemption_timer_deadline) {
				kvm_state.hdr.vmx.flags |=
					KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
				kvm_state.hdr.vmx.preemption_timer_deadline =
					vmx->nested.preemption_timer_deadline;
			}
		}
	}

	/* Buffer too small: report the needed size without copying anything. */
	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02. When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
	} else  {
		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
			if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
				/*
				 * L1 hypervisor is not obliged to keep eVMCS
				 * clean fields data always up-to-date while
				 * not in guest mode, 'hv_clean_fields' is only
				 * supposed to be actual upon vmentry so we need
				 * to ignore it here and do full copy.
				 */
				copy_enlightened_to_vmcs12(vmx, 0);
			else if (enable_shadow_vmcs)
				copy_shadow_to_vmcs12(vmx);
		}
	}

	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);

	/*
	 * Copy over the full allocated size of vmcs12 rather than just the size
	 * of the struct.
	 */
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
			return -EFAULT;
	}
out:
	return kvm_state.size;
}

/*
 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
 */
void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		/* Drop any pending nested run before forcing the exit. */
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(vcpu);
}

/*
 * Restore nested VMX state supplied by userspace.
 *
 * @kvm_state is the header; @user_kvm_nested_state is the user pointer from
 * which the vmcs12 (and optionally the shadow vmcs12) payload is read.  The
 * header is validated first, then any current nested state is dropped via
 * vmx_leave_nested() and the new state is installed step by step.
 *
 * Returns 0 on success, -EINVAL for an inconsistent or unsupported state,
 * -EFAULT if reading the payload from userspace fails, or the error returned
 * by enter_vmx_operation() / nested_vmx_enter_non_root_mode().
 */
static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	enum vm_entry_failure_code ignored;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable eVMCS capability on vCPU. However, since then
		 * code was changed such that flag signals vmcs12 should
		 * be copied into eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow user
		 * to set this flag even when there is no VMXON region.
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!nested_vmx_allowed(vcpu))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	/* Guest mode cannot be claimed both inside and outside of SMM. */
	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	/* Reject SMM flags this code does not know about. */
	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	/* Reject VMX header flags this code does not know about. */
	if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
		(kvm_state->flags &
		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	/* SMM guest mode is only accepted together with SMM VMXON. */
	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
		(!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
			return -EINVAL;

	/* Header accepted: discard whatever nested state the vCPU had. */
	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted if no VMCS loaded */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
		/* See vmx_has_valid_vmcs12.  */
		if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
		    (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
		    (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
			return -EINVAL;
		else
			return 0;
	}

	if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * nested_vmx_handle_enlightened_vmptrld() cannot be called
		 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
		 * restored yet. EVMCS will be mapped from
		 * nested_get_vmcs12_pages().
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	/* Not in guest mode: the cached vmcs12 is all there is to restore. */
	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	/* Default error for the guest-mode checks below. */
	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	return 0;

error_guest_mode:
	/* Entering guest mode failed: make sure no nested run stays pending. */
	vmx->nested.nested_run_pending = 0;
	return ret;
}

/*
 * When shadow VMCS is enabled, write the physical addresses of the global
 * VMREAD/VMWRITE bitmaps into the VMCS being written by vmcs_write64().
 */
void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))

/*
 * Compute the value stored in msrs->vmcs_enum: the highest field "index"
 * found among the populated vmcs12 fields, shifted into position by
 * VMCS_FIELD_INDEX_SHIFT.
 */
static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so called "index" of the VMCS field encoding, not
	 * the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1.  Simply find the field with the highest index.
	 */
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated. */
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control msrs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
{
	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
	 * fields of vmcs01 and vmcs02, will turn these bits off - and
	 * nested_vmx_l1_wants_exit() will not pass related exits to L1.
	 * These rules have exceptions below.
	 */

	/* pin-based controls */
	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
		msrs->pinbased_ctls_low,
		msrs->pinbased_ctls_high);
	msrs->pinbased_ctls_low |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;

	/* exit controls */
	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
		msrs->exit_ctls_low,
		msrs->exit_ctls_high);
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;

	/* entry controls */
	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
		msrs->entry_ctls_low,
		msrs->entry_ctls_high);
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
		VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;

	/* cpu-based controls */
	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
		msrs->procbased_ctls_low,
		msrs->procbased_ctls_high);
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);

	/*
	 * secondary cpu-based controls.  Do not include those that
	 * depend on CPUID bits, they are added later by
	 * vmx_vcpu_after_set_cpuid.
	 */
	if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
		      msrs->secondary_ctls_low,
		      msrs->secondary_ctls_high);

	msrs->secondary_ctls_low = 0;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_XSAVES |
		SECONDARY_EXEC_TSC_SCALING;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		/* Keep only what the caller-supplied capabilities allow... */
		msrs->ept_caps &= ept_caps;
		/* ...then add the bits that are advertised unconditionally. */
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}
	}

	if (cpu_has_vmx_vmfunc()) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VMFUNC;
		/*
		 * Advertise EPTP switching unconditionally
		 * since we emulate it
		 */
		if (enable_ept)
			msrs->vmfunc_controls =
				VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context.  The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;

	/* miscellaneous data */
	rdmsr(MSR_IA32_VMX_MISC,
		msrs->misc_low,
		msrs->misc_high);
	msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;

	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic =
		VMCS12_REVISION |
		VMX_BASIC_TRUE_CTLS |
		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;

	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

/*
 * Release the VMREAD/VMWRITE bitmap pages.  They only exist when shadow VMCS
 * is enabled, so nothing is done otherwise.
 */
void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

/*
 * One-time nested VMX setup: disable shadow VMCS if the CPU lacks it,
 * allocate the VMREAD/VMWRITE bitmap pages and initialise the shadow field
 * tables when it is enabled, and install the handlers for the VMX
 * instructions in @exit_handlers.
 *
 * Returns 0 on success or -ENOMEM if a bitmap page cannot be allocated.
 */
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				/* Undo the pages allocated so far. */
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON]		= handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept;
	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc;

	return 0;
}

struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.check_events = vmx_check_nested_events,
	.handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
	.hv_timer_pending =
nested_vmx_preemption_timer_pending, 6820 .triple_fault = nested_vmx_triple_fault, 6821 .get_state = vmx_get_nested_state, 6822 .set_state = vmx_set_nested_state, 6823 .get_nested_state_pages = vmx_get_nested_state_pages, 6824 .write_log_dirty = nested_vmx_write_pml_buffer, 6825 .enable_evmcs = nested_enable_evmcs, 6826 .get_evmcs_version = nested_get_evmcs_version, 6827 }; 6828