// SPDX-License-Identifier: GPL-2.0

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "evmcs.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't simply reset the guest here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}
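
/*
 * Worked example (sketch, values are hypothetical): the two helpers below
 * follow the SDM convention for the VMX control capability MSRs, where the
 * low 32 bits report the allowed-0 settings (bits that must be 1) and the
 * high 32 bits report the allowed-1 settings (bits that may be 1).  E.g.
 * with low = 0x16 and high = 0xfff9fffe, a control value passes
 * vmx_control_verify() iff bits 1, 2 and 4 are all set and no bit that is
 * clear in 0xfff9fffe is set.
 */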
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
		kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
		vmx->nested.hv_evmcs = NULL;
	}

	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;

	/*
	 * All lazily updated registers will be reloaded from VMCS12 on both
	 * vmentry and vmexit.
	 */
	vcpu->arch.regs_dirty = 0;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = INVALID_GPA;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_clean(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}

#define EPTP_PA_MASK	GENMASK_ULL(51, 12)

static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
{
	return VALID_PAGE(root_hpa) &&
	       ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
}

static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
				       gpa_t addr)
{
	uint i;
	struct kvm_mmu_root_info *cached_root;

	WARN_ON_ONCE(!mmu_is_nested(vcpu));

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		cached_root = &vcpu->arch.mmu->prev_roots[i];

		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
					    eptp))
			vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
	}
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vm_exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else {
		if (fault->error_code & PFERR_RSVD_MASK)
			vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
		else
			vm_exit_reason = EXIT_REASON_EPT_VIOLATION;

		/*
		 * Although the caller (kvm_inject_emulated_page_fault) would
		 * have already synced the faulting address in the shadow EPT
		 * tables for the current EPTP12, we also need to sync it for
		 * any other cached EPTP02s based on the same EP4TA, since the
		 * TLB associates mappings to the EP4TA rather than the full EPTP.
		 */
		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
					   fault->address);
	}

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
	int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);

	kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	nested_ept_new_eptp(vcpu);
	vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}
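
/*
 * Worked example for nested_vmx_is_page_fault_vmexit() (sketch, values are
 * hypothetical): with PFEC_MASK = 1, PFEC_MATCH = 1 and PF_VECTOR set in the
 * exception bitmap, only page faults with the P bit set in the error code
 * cause a VM-exit to L1; all other page faults are delivered to L2.
 * Clearing PF_VECTOR in the bitmap inverts the selection, which is exactly
 * what the "inequality ^ bit" expression implements.
 */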

/*
 * KVM wants to inject the page faults it intercepted into the guest. This
 * function checks whether, in a nested guest, such a fault needs to be
 * injected into L1 or L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (nr == PF_VECTOR) {
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
			return 1;
		}
	} else if (vmcs12->exception_bitmap & (1u << nr)) {
		if (nr == DB_VECTOR) {
			if (!has_payload) {
				payload = vcpu->arch.dr6;
				payload &= ~DR6_BT;
				payload ^= DR6_ACTIVE_LOW;
			}
			*exit_qual = payload;
		} else
			*exit_qual = 0;
		return 1;
	}

	return 0;
}

static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
						    struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
	    !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
		return true;
	}
	return false;
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * For x2APIC MSRs, ignore the vmcs01 bitmap.  L1 can enable x2APIC without L1
 * itself utilizing x2APIC.  All MSRs were previously set to be intercepted,
 * only the "disable intercept" case needs to be handled.
 */
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
							unsigned long *msr_bitmap_l0,
							u32 msr, int type)
{
	if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);

	if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)					\
static inline									\
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,			\
					 unsigned long *msr_bitmap_l1,		\
					 unsigned long *msr_bitmap_l0, u32 msr)	\
{										\
	if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||		\
	    vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))			\
		vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
	else									\
		vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
}
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)

static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
						    unsigned long *msr_bitmap_l1,
						    unsigned long *msr_bitmap_l0,
						    u32 msr, int types)
{
	if (types & MSR_TYPE_R)
		nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
						  msr_bitmap_l0, msr);
	if (types & MSR_TYPE_W)
		nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
						   msr_bitmap_l0, msr);
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
	struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	/*
	 * MSR bitmap update can be skipped when:
	 * - MSR bitmap for L1 hasn't changed.
	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
	 *   before.
	 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
	 *   and tells KVM (L0) there were no changes in MSR bitmap for L2.
	 */
	if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
	    evmcs->hv_enlightenments_control.msr_bitmap &&
	    evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
		return true;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively toggle those relevant to L2.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);
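	/*
	 * Layout note (descriptive, per the SDM MSR-bitmap format): bytes
	 * 0x000-0x3ff hold the read bitmap and bytes 0x800-0xbff the write
	 * bitmap for MSRs 0x00000000 - 0x00001fff, which is why
	 * enable_x2apic_msr_intercepts() sets both msr_bitmap[word] and
	 * msr_bitmap[word + 0x800 / sizeof(long)] for MSRs 0x800 - 0x8ff.
	 */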

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_x2apic_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/*
	 * Always check vmcs01's bitmap to honor userspace MSR filters and any
	 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
	 */
#ifdef CONFIG_X86_64
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PRED_CMD, MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);

	vmx->nested.force_msr_bitmap_recalc = false;

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			      VMCS12_SIZE);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			       VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    (CC(!nested_cpu_has_vid(vmcs12)) ||
	     CC(!nested_exit_intr_ack_set(vcpu)) ||
	     CC((vmcs12->posted_intr_nv & 0xff00)) ||
	     CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}
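
/*
 * Descriptive note (added for clarity): in nested_vmx_msr_check_common()
 * below, "e->index >> 8 == 0x8" matches the x2APIC MSR range 0x800 - 0x8ff,
 * so MSR load/store list entries that target x2APIC MSRs are rejected while
 * the vCPU's APIC is in x2APIC mode.
 */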
static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware as
 * much as possible, process all valid entries before failing rather than
 * prechecking for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}
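
/*
 * Sizing note (sketch, assumes VMX_MISC_MSR_LIST_MULTIPLIER == 512):
 * nested_vmx_max_atomic_switch_msrs() mirrors the IA32_VMX_MISC[27:25]
 * encoding, where a raw field value of N permits (N + 1) * 512 entries per
 * atomic MSR load/store list, so the common case of N == 0 yields 512
 * entries.
 */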

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC) {
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
						offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_slot;
	bool in_autostore_list;
	int last;

	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
	in_autostore_list = msr_autostore_slot >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here. Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
			 * instead of reading the value from the vmcs02 VMExit
			 * MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore. Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

/*
 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_PDPTE;
		return -EINVAL;
	}

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
	 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
	 * full TLB flush from the guest's perspective. This is required even
	 * if VPID is disabled in the host as KVM may need to synchronize the
	 * MMU in response to the guest TLB flush.
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled. As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * VPID is enabled and in use by vmcs12.  If vpid12 is changing, then
	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
	 * is the VPID incorporated into the MMU context.  I.e. KVM must assume
	 * that the new vpid12 has never been used and thus represents a new
	 * guest ASID that cannot have entries in the TLB.
	 */
	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
	 * KVM was unable to allocate a VPID for L2, flush the current context
	 * as the effective ASID is common to both L1 and L2.
	 */
	if (!nested_has_guest_tlb_tag(vcpu))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
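
/*
 * Worked example (sketch): is_bitwise_subset(superset, subset, mask) checks
 * that, within 'mask', 'subset' sets no bit that is clear in 'superset'.
 * E.g. is_bitwise_subset(0xb, 0x3, 0xff) is true, while
 * is_bitwise_subset(0xb, 0x5, 0xff) is false because bit 2 is new.  The
 * restore helpers below use it in both directions: every must-be-1 bit KVM
 * reports must remain set in the restored low word, and the restored high
 * word must not allow any bit that KVM does not already allow.
 */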
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmx->nested.msrs.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 supported;
	u32 *lowp, *highp;

	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		lowp = &vmx->nested.msrs.pinbased_ctls_low;
		highp = &vmx->nested.msrs.pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		lowp = &vmx->nested.msrs.procbased_ctls_low;
		highp = &vmx->nested.msrs.procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		lowp = &vmx->nested.msrs.exit_ctls_low;
		highp = &vmx->nested.msrs.exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		lowp = &vmx->nested.msrs.entry_ctls_low;
		highp = &vmx->nested.msrs.entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		lowp = &vmx->nested.msrs.secondary_ctls_low;
		highp = &vmx->nested.msrs.secondary_ctls_high;
		break;
	default:
		BUG();
	}

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc;

	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				   vmx->nested.msrs.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap;

	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
					   vmx->nested.msrs.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 *msr;

	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		msr = &vmx->nested.msrs.cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		msr = &vmx->nested.msrs.cr4_fixed0;
		break;
	default:
		BUG();
	}

	/*
	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*msr = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmx->nested.msrs.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
	}

	/*
	 * Not used?
	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
	 * vmcs12->page_fault_error_code_mask =
	 *		evmcs->page_fault_error_code_mask;
	 * vmcs12->page_fault_error_code_match =
	 *		evmcs->page_fault_error_code_match;
	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
	 */

	/*
	 * Read only fields:
	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
	 * vmcs12->exit_qualification = evmcs->exit_qualification;
	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
	 *
	 * Not present in struct vmcs12:
	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return;
}

static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/*
	 * Should not be changed by KVM:
	 *
	 * evmcs->host_es_selector = vmcs12->host_es_selector;
	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
	 * evmcs->host_cr0 = vmcs12->host_cr0;
	 * evmcs->host_cr3 = vmcs12->host_cr3;
	 * evmcs->host_cr4 = vmcs12->host_cr4;
	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
	 * evmcs->host_rip = vmcs12->host_rip;
	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
	 * evmcs->host_fs_base = vmcs12->host_fs_base;
	 * evmcs->host_gs_base = vmcs12->host_gs_base;
	 * evmcs->host_tr_base = vmcs12->host_tr_base;
	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
	 * evmcs->host_rsp = vmcs12->host_rsp;
	 * sync_vmcs02_to_vmcs12() doesn't read these:
	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
	 * evmcs->ept_pointer = vmcs12->ept_pointer;
evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1842 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1843 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1844 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1845 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1846 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1847 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1848 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1849 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1850 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1851 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1852 * evmcs->page_fault_error_code_mask = 1853 * vmcs12->page_fault_error_code_mask; 1854 * evmcs->page_fault_error_code_match = 1855 * vmcs12->page_fault_error_code_match; 1856 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1857 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1858 * evmcs->tsc_offset = vmcs12->tsc_offset; 1859 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1860 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1861 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1862 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1863 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1864 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1865 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1866 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1867 * 1868 * Not present in struct vmcs12: 1869 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1870 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1871 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1872 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1873 */ 1874 1875 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1876 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1877 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1878 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1879 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1880 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1881 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1882 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1883 1884 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1885 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1886 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1887 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1888 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1889 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1890 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1891 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1892 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1893 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1894 1895 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1896 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1897 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1898 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1899 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1900 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1901 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1902 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1903 1904 evmcs->guest_es_base = vmcs12->guest_es_base; 1905 evmcs->guest_cs_base = vmcs12->guest_cs_base; 
1906 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1907 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1908 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1909 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1910 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1911 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1912 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1913 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1914 1915 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1916 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1917 1918 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1919 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1920 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1921 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1922 1923 evmcs->guest_pending_dbg_exceptions = 1924 vmcs12->guest_pending_dbg_exceptions; 1925 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1926 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1927 1928 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1929 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1930 1931 evmcs->guest_cr0 = vmcs12->guest_cr0; 1932 evmcs->guest_cr3 = vmcs12->guest_cr3; 1933 evmcs->guest_cr4 = vmcs12->guest_cr4; 1934 evmcs->guest_dr7 = vmcs12->guest_dr7; 1935 1936 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1937 1938 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1939 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1940 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1941 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1942 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1943 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1944 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1945 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1946 1947 evmcs->exit_qualification = vmcs12->exit_qualification; 1948 1949 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1950 evmcs->guest_rsp = vmcs12->guest_rsp; 1951 evmcs->guest_rflags = vmcs12->guest_rflags; 1952 1953 evmcs->guest_interruptibility_info = 1954 vmcs12->guest_interruptibility_info; 1955 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1956 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1957 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1958 evmcs->vm_entry_exception_error_code = 1959 vmcs12->vm_entry_exception_error_code; 1960 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 1961 1962 evmcs->guest_rip = vmcs12->guest_rip; 1963 1964 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 1965 1966 return; 1967 } 1968 1969 /* 1970 * This is an equivalent of the nested hypervisor executing the vmptrld 1971 * instruction. 
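 * The eVMCS GPA is taken from L1's Hyper-V VP assist page (see
 * nested_enlightened_vmentry()) rather than from a VMPTRLD operand; the
 * mapped eVMCS page then acts as L1's current VMCS for the purposes of
 * nested VM-Entry emulation.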
 */
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
        struct kvm_vcpu *vcpu, bool from_launch)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool evmcs_gpa_changed = false;
        u64 evmcs_gpa;

        if (likely(!vmx->nested.enlightened_vmcs_enabled))
                return EVMPTRLD_DISABLED;

        if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
                nested_release_evmcs(vcpu);
                return EVMPTRLD_DISABLED;
        }

        if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
                vmx->nested.current_vmptr = INVALID_GPA;

                nested_release_evmcs(vcpu);

                if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
                                 &vmx->nested.hv_evmcs_map))
                        return EVMPTRLD_ERROR;

                vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

                /*
                 * Currently, KVM only supports eVMCS version 1
                 * (== KVM_EVMCS_VERSION) and thus expects the guest to set
                 * this value in the first u32 field of the eVMCS, which
                 * specifies the eVMCS VersionNumber.
                 *
                 * The guest should learn the host's supported eVMCS versions
                 * by examining CPUID.0x4000000A.EAX[0:15]. The host userspace
                 * VMM is expected to set this CPUID leaf according to the
                 * value returned in vmcs_version from nested_enable_evmcs().
                 *
                 * However, it turns out that Microsoft Hyper-V fails to comply
                 * with its own invented interface: when Hyper-V uses eVMCS, it
                 * simply sets the first u32 field of the eVMCS to the
                 * revision_id specified in MSR_IA32_VMX_BASIC instead of the
                 * eVMCS version number, i.e. one of the supported versions
                 * specified in CPUID.0x4000000A.EAX[0:15].
                 *
                 * To work around this Hyper-V bug, accept either a supported
                 * eVMCS version or the VMCS12 revision_id as valid values for
                 * the first u32 field of the eVMCS.
                 */
                if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
                    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
                        nested_release_evmcs(vcpu);
                        return EVMPTRLD_VMFAIL;
                }

                vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

                evmcs_gpa_changed = true;
                /*
                 * Unlike normal vmcs12, enlightened vmcs12 is not fully
                 * reloaded from guest's memory (read only fields, fields not
                 * present in struct hv_enlightened_vmcs, ...). Make sure there
                 * are no leftovers.
                 */
                if (from_launch) {
                        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
                        memset(vmcs12, 0, sizeof(*vmcs12));
                        vmcs12->hdr.revision_id = VMCS12_REVISION;
                }

        }

        /*
         * Clean fields data can't be used on VMLAUNCH and when we switch
         * between different L2 guests as KVM keeps a single VMCS12 per L1.
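         * Clearing every bit in hv_clean_fields below forces the next
         * copy_enlightened_to_vmcs12() to treat all field groups as dirty
         * and reload them from the eVMCS.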
2047 */ 2048 if (from_launch || evmcs_gpa_changed) { 2049 vmx->nested.hv_evmcs->hv_clean_fields &= 2050 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2051 2052 vmx->nested.force_msr_bitmap_recalc = true; 2053 } 2054 2055 return EVMPTRLD_SUCCEEDED; 2056 } 2057 2058 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2059 { 2060 struct vcpu_vmx *vmx = to_vmx(vcpu); 2061 2062 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2063 copy_vmcs12_to_enlightened(vmx); 2064 else 2065 copy_vmcs12_to_shadow(vmx); 2066 2067 vmx->nested.need_vmcs12_to_shadow_sync = false; 2068 } 2069 2070 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2071 { 2072 struct vcpu_vmx *vmx = 2073 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2074 2075 vmx->nested.preemption_timer_expired = true; 2076 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2077 kvm_vcpu_kick(&vmx->vcpu); 2078 2079 return HRTIMER_NORESTART; 2080 } 2081 2082 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2083 { 2084 struct vcpu_vmx *vmx = to_vmx(vcpu); 2085 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2086 2087 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2088 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2089 2090 if (!vmx->nested.has_preemption_timer_deadline) { 2091 vmx->nested.preemption_timer_deadline = 2092 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2093 vmx->nested.has_preemption_timer_deadline = true; 2094 } 2095 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2096 } 2097 2098 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2099 u64 preemption_timeout) 2100 { 2101 struct vcpu_vmx *vmx = to_vmx(vcpu); 2102 2103 /* 2104 * A timer value of zero is architecturally guaranteed to cause 2105 * a VMExit prior to executing any instructions in the guest. 2106 */ 2107 if (preemption_timeout == 0) { 2108 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2109 return; 2110 } 2111 2112 if (vcpu->arch.virtual_tsc_khz == 0) 2113 return; 2114 2115 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2116 preemption_timeout *= 1000000; 2117 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2118 hrtimer_start(&vmx->nested.preemption_timer, 2119 ktime_add_ns(ktime_get(), preemption_timeout), 2120 HRTIMER_MODE_ABS_PINNED); 2121 } 2122 2123 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2124 { 2125 if (vmx->nested.nested_run_pending && 2126 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2127 return vmcs12->guest_ia32_efer; 2128 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2129 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2130 else 2131 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2132 } 2133 2134 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2135 { 2136 /* 2137 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2138 * according to L0's settings (vmcs12 is irrelevant here). Host 2139 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2140 * will be set as needed prior to VMLAUNCH/VMRESUME. 2141 */ 2142 if (vmx->nested.vmcs02_initialized) 2143 return; 2144 vmx->nested.vmcs02_initialized = true; 2145 2146 /* 2147 * We don't care what the EPTP value is we just need to guarantee 2148 * it's valid so we don't get a false positive when doing early 2149 * consistency checks. 
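 * L2 never actually runs with this throwaway EPTP; the real EPT pointer
 * for L2 is written later, when KVM loads the nested MMU root.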
2150 */ 2151 if (enable_ept && nested_early_check) 2152 vmcs_write64(EPT_POINTER, 2153 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2154 2155 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2156 if (cpu_has_vmx_vmfunc()) 2157 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2158 2159 if (cpu_has_vmx_posted_intr()) 2160 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2161 2162 if (cpu_has_vmx_msr_bitmap()) 2163 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2164 2165 /* 2166 * PML is emulated for L2, but never enabled in hardware as the MMU 2167 * handles A/D emulation. Disabling PML for L2 also avoids having to 2168 * deal with filtering out L2 GPAs from the buffer. 2169 */ 2170 if (enable_pml) { 2171 vmcs_write64(PML_ADDRESS, 0); 2172 vmcs_write16(GUEST_PML_INDEX, -1); 2173 } 2174 2175 if (cpu_has_vmx_encls_vmexit()) 2176 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2177 2178 /* 2179 * Set the MSR load/store lists to match L0's settings. Only the 2180 * addresses are constant (for vmcs02), the counts can change based 2181 * on L2's behavior, e.g. switching to/from long mode. 2182 */ 2183 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2184 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2185 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2186 2187 vmx_set_constant_host_state(vmx); 2188 } 2189 2190 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2191 struct vmcs12 *vmcs12) 2192 { 2193 prepare_vmcs02_constant_state(vmx); 2194 2195 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2196 2197 if (enable_vpid) { 2198 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2199 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2200 else 2201 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2202 } 2203 } 2204 2205 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2206 struct vmcs12 *vmcs12) 2207 { 2208 u32 exec_control; 2209 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2210 2211 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2212 prepare_vmcs02_early_rare(vmx, vmcs12); 2213 2214 /* 2215 * PIN CONTROLS 2216 */ 2217 exec_control = __pin_controls_get(vmcs01); 2218 exec_control |= (vmcs12->pin_based_vm_exec_control & 2219 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2220 2221 /* Posted interrupts setting is only taken from vmcs12. */ 2222 vmx->nested.pi_pending = false; 2223 if (nested_cpu_has_posted_intr(vmcs12)) 2224 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2225 else 2226 exec_control &= ~PIN_BASED_POSTED_INTR; 2227 pin_controls_set(vmx, exec_control); 2228 2229 /* 2230 * EXEC CONTROLS 2231 */ 2232 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2233 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2234 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2235 exec_control &= ~CPU_BASED_TPR_SHADOW; 2236 exec_control |= vmcs12->cpu_based_vm_exec_control; 2237 2238 vmx->nested.l1_tpr_threshold = -1; 2239 if (exec_control & CPU_BASED_TPR_SHADOW) 2240 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2241 #ifdef CONFIG_X86_64 2242 else 2243 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2244 CPU_BASED_CR8_STORE_EXITING; 2245 #endif 2246 2247 /* 2248 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2249 * for I/O port accesses. 
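 * KVM does not reuse L1's I/O bitmaps for vmcs02; it takes the
 * unconditional exit and decides while handling it (by consulting
 * vmcs12's I/O bitmaps) whether the exit should be reflected to L1.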
2250 */ 2251 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2252 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2253 2254 /* 2255 * This bit will be computed in nested_get_vmcs12_pages, because 2256 * we do not have access to L1's MSR bitmap yet. For now, keep 2257 * the same bit as before, hoping to avoid multiple VMWRITEs that 2258 * only set/clear this bit. 2259 */ 2260 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2261 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2262 2263 exec_controls_set(vmx, exec_control); 2264 2265 /* 2266 * SECONDARY EXEC CONTROLS 2267 */ 2268 if (cpu_has_secondary_exec_ctrls()) { 2269 exec_control = __secondary_exec_controls_get(vmcs01); 2270 2271 /* Take the following fields only from vmcs12 */ 2272 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2273 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2274 SECONDARY_EXEC_ENABLE_INVPCID | 2275 SECONDARY_EXEC_ENABLE_RDTSCP | 2276 SECONDARY_EXEC_XSAVES | 2277 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2278 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2279 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2280 SECONDARY_EXEC_ENABLE_VMFUNC | 2281 SECONDARY_EXEC_TSC_SCALING | 2282 SECONDARY_EXEC_DESC); 2283 2284 if (nested_cpu_has(vmcs12, 2285 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2286 exec_control |= vmcs12->secondary_vm_exec_control; 2287 2288 /* PML is emulated and never enabled in hardware for L2. */ 2289 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2290 2291 /* VMCS shadowing for L2 is emulated for now */ 2292 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2293 2294 /* 2295 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2296 * will not have to rewrite the controls just for this bit. 2297 */ 2298 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2299 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2300 exec_control |= SECONDARY_EXEC_DESC; 2301 2302 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2303 vmcs_write16(GUEST_INTR_STATUS, 2304 vmcs12->guest_intr_status); 2305 2306 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2307 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2308 2309 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2310 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2311 2312 secondary_exec_controls_set(vmx, exec_control); 2313 } 2314 2315 /* 2316 * ENTRY CONTROLS 2317 * 2318 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2319 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2320 * on the related bits (if supported by the CPU) in the hope that 2321 * we can avoid VMWrites during vmx_set_efer(). 2322 */ 2323 exec_control = __vm_entry_controls_get(vmcs01); 2324 exec_control |= vmcs12->vm_entry_controls; 2325 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2326 if (cpu_has_load_ia32_efer()) { 2327 if (guest_efer & EFER_LMA) 2328 exec_control |= VM_ENTRY_IA32E_MODE; 2329 if (guest_efer != host_efer) 2330 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2331 } 2332 vm_entry_controls_set(vmx, exec_control); 2333 2334 /* 2335 * EXIT CONTROLS 2336 * 2337 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2338 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2339 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
2340 */ 2341 exec_control = __vm_exit_controls_get(vmcs01); 2342 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2343 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2344 else 2345 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2346 vm_exit_controls_set(vmx, exec_control); 2347 2348 /* 2349 * Interrupt/Exception Fields 2350 */ 2351 if (vmx->nested.nested_run_pending) { 2352 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2353 vmcs12->vm_entry_intr_info_field); 2354 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2355 vmcs12->vm_entry_exception_error_code); 2356 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2357 vmcs12->vm_entry_instruction_len); 2358 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2359 vmcs12->guest_interruptibility_info); 2360 vmx->loaded_vmcs->nmi_known_unmasked = 2361 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2362 } else { 2363 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2364 } 2365 } 2366 2367 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2368 { 2369 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2370 2371 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2372 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2373 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2374 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2375 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2376 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2377 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2378 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2379 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2380 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2381 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2382 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2383 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2384 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2385 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2386 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2387 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2388 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2389 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2390 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2391 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2392 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2393 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2394 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2395 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2396 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2397 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2398 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2399 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2400 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2401 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2402 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2403 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2404 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2405 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2406 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2407 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2408 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2409 2410 vmx->segment_cache.bitmask = 0; 2411 } 2412 2413 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2414 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 
2415 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2416 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2417 vmcs12->guest_pending_dbg_exceptions); 2418 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2419 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2420 2421 /* 2422 * L1 may access the L2's PDPTR, so save them to construct 2423 * vmcs12 2424 */ 2425 if (enable_ept) { 2426 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2427 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2428 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2429 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2430 } 2431 2432 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2433 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2434 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2435 } 2436 2437 if (nested_cpu_has_xsaves(vmcs12)) 2438 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2439 2440 /* 2441 * Whether page-faults are trapped is determined by a combination of 2442 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2443 * doesn't care about page faults then we should set all of these to 2444 * L1's desires. However, if L0 does care about (some) page faults, it 2445 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2446 * simply ask to exit on each and every L2 page fault. This is done by 2447 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2448 * Note that below we don't need special code to set EB.PF beyond the 2449 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2450 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2451 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2452 */ 2453 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2454 /* 2455 * TODO: if both L0 and L1 need the same MASK and MATCH, 2456 * go ahead and use it? 2457 */ 2458 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2459 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2460 } else { 2461 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2462 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2463 } 2464 2465 if (cpu_has_vmx_apicv()) { 2466 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2467 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2468 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2469 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2470 } 2471 2472 /* 2473 * Make sure the msr_autostore list is up to date before we set the 2474 * count in the vmcs02. 2475 */ 2476 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2477 2478 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2479 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2480 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2481 2482 set_cr4_guest_host_mask(vmx); 2483 } 2484 2485 /* 2486 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2487 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2488 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2489 * guest in a way that will both be appropriate to L1's requests, and our 2490 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2491 * function also has additional necessary side-effects, like setting various 2492 * vcpu->arch fields. 2493 * Returns 0 on success, 1 on failure. 
Invalid state exit qualification code is assigned to entry_failure_code on failure.
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                          bool from_vmentry,
                          enum vm_entry_failure_code *entry_failure_code)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        bool load_guest_pdptrs_vmcs12 = false;

        if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
                prepare_vmcs02_rare(vmx, vmcs12);
                vmx->nested.dirty_vmcs12 = false;

                load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
                        !(vmx->nested.hv_evmcs->hv_clean_fields &
                          HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
        }

        if (vmx->nested.nested_run_pending &&
            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
                kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
                vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
        } else {
                kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
                vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
        }
        if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
            !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
                vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
        vmx_set_rflags(vcpu, vmcs12->guest_rflags);

        /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
         * bitwise-or of what L1 wants to trap for L2, and what we want to
         * trap. Note that CR0.TS also needs updating - we do this later.
         */
        vmx_update_exception_bitmap(vcpu);
        vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

        if (vmx->nested.nested_run_pending &&
            (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
                vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
                vcpu->arch.pat = vmcs12->guest_ia32_pat;
        } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
        }

        vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
                        vcpu->arch.l1_tsc_offset,
                        vmx_get_l2_tsc_offset(vcpu),
                        vmx_get_l2_tsc_multiplier(vcpu));

        vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
                        vcpu->arch.l1_tsc_scaling_ratio,
                        vmx_get_l2_tsc_multiplier(vcpu));

        vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
        if (kvm_has_tsc_control)
                vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

        nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);

        if (nested_cpu_has_ept(vmcs12))
                nested_ept_init_mmu_context(vcpu);

        /*
         * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
         * bits which we consider mandatory enabled.
         * The CR0_READ_SHADOW is what L2 should have expected to read given
         * the specifications by L1; it's not enough to take
         * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may have
         * more bits set than L1 expected.
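         * nested_read_cr0() yields
         * (guest_cr0 & ~cr0_guest_host_mask) | (cr0_read_shadow & cr0_guest_host_mask)
         * from vmcs12: bits L1 leaves to L2 are read from guest_cr0, bits L1
         * intercepts are read from L1's shadow value.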
         */
        vmx_set_cr0(vcpu, vmcs12->guest_cr0);
        vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

        vmx_set_cr4(vcpu, vmcs12->guest_cr4);
        vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

        vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
        /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
        vmx_set_efer(vcpu, vcpu->arch.efer);

        /*
         * If guest state is invalid and unrestricted guest is disabled, then
         * L1 attempted a VMEntry to L2 with invalid state; fail the VMEntry.
         *
         * However, when force loading the guest state (SMM exit or loading
         * nested state after migration), it is possible to have invalid guest
         * state at this point; it will be fixed up later by restoring the L2
         * register state.
         */
        if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
                *entry_failure_code = ENTRY_FAIL_DEFAULT;
                return -EINVAL;
        }

        /* Load the guest's CR3, backed by either nested EPT or shadow page tables. */
        if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
                                from_vmentry, entry_failure_code))
                return -EINVAL;

        /*
         * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
         * on nested VM-Exit, which can occur without actually running L2 and
         * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
         * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
         * transition to HLT instead of running L2.
         */
        if (enable_ept)
                vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);

        /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
        if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
            is_pae_paging(vcpu)) {
                vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
                vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
                vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
                vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
        }

        if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
            WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
                                     vmcs12->guest_ia32_perf_global_ctrl))) {
                *entry_failure_code = ENTRY_FAIL_DEFAULT;
                return -EINVAL;
        }

        kvm_rsp_write(vcpu, vmcs12->guest_rsp);
        kvm_rip_write(vcpu, vmcs12->guest_rip);

        /*
         * It was observed that genuine Hyper-V running in L1 doesn't reset
         * 'hv_clean_fields' by itself; it only sets the corresponding dirty
         * bits when it changes a field in the eVMCS. Mark all fields as clean
         * here.
2631 */ 2632 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2633 vmx->nested.hv_evmcs->hv_clean_fields |= 2634 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2635 2636 return 0; 2637 } 2638 2639 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2640 { 2641 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2642 nested_cpu_has_virtual_nmis(vmcs12))) 2643 return -EINVAL; 2644 2645 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2646 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2647 return -EINVAL; 2648 2649 return 0; 2650 } 2651 2652 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2653 { 2654 struct vcpu_vmx *vmx = to_vmx(vcpu); 2655 2656 /* Check for memory type validity */ 2657 switch (new_eptp & VMX_EPTP_MT_MASK) { 2658 case VMX_EPTP_MT_UC: 2659 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2660 return false; 2661 break; 2662 case VMX_EPTP_MT_WB: 2663 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2664 return false; 2665 break; 2666 default: 2667 return false; 2668 } 2669 2670 /* Page-walk levels validity. */ 2671 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2672 case VMX_EPTP_PWL_5: 2673 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2674 return false; 2675 break; 2676 case VMX_EPTP_PWL_4: 2677 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2678 return false; 2679 break; 2680 default: 2681 return false; 2682 } 2683 2684 /* Reserved bits should not be set */ 2685 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2686 return false; 2687 2688 /* AD, if set, should be supported */ 2689 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2690 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2691 return false; 2692 } 2693 2694 return true; 2695 } 2696 2697 /* 2698 * Checks related to VM-Execution Control Fields 2699 */ 2700 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2701 struct vmcs12 *vmcs12) 2702 { 2703 struct vcpu_vmx *vmx = to_vmx(vcpu); 2704 2705 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2706 vmx->nested.msrs.pinbased_ctls_low, 2707 vmx->nested.msrs.pinbased_ctls_high)) || 2708 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2709 vmx->nested.msrs.procbased_ctls_low, 2710 vmx->nested.msrs.procbased_ctls_high))) 2711 return -EINVAL; 2712 2713 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2714 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2715 vmx->nested.msrs.secondary_ctls_low, 2716 vmx->nested.msrs.secondary_ctls_high))) 2717 return -EINVAL; 2718 2719 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2720 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2721 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2722 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2723 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2724 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2725 nested_vmx_check_nmi_controls(vmcs12) || 2726 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2727 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2728 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2729 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2730 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2731 return -EINVAL; 2732 2733 if (!nested_cpu_has_preemption_timer(vmcs12) && 2734 nested_cpu_has_save_preemption_timer(vmcs12)) 2735 return -EINVAL; 2736 2737 if (nested_cpu_has_ept(vmcs12) && 2738 CC(!nested_vmx_check_eptp(vcpu, 
vmcs12->ept_pointer))) 2739 return -EINVAL; 2740 2741 if (nested_cpu_has_vmfunc(vmcs12)) { 2742 if (CC(vmcs12->vm_function_control & 2743 ~vmx->nested.msrs.vmfunc_controls)) 2744 return -EINVAL; 2745 2746 if (nested_cpu_has_eptp_switching(vmcs12)) { 2747 if (CC(!nested_cpu_has_ept(vmcs12)) || 2748 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2749 return -EINVAL; 2750 } 2751 } 2752 2753 return 0; 2754 } 2755 2756 /* 2757 * Checks related to VM-Exit Control Fields 2758 */ 2759 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2760 struct vmcs12 *vmcs12) 2761 { 2762 struct vcpu_vmx *vmx = to_vmx(vcpu); 2763 2764 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2765 vmx->nested.msrs.exit_ctls_low, 2766 vmx->nested.msrs.exit_ctls_high)) || 2767 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2768 return -EINVAL; 2769 2770 return 0; 2771 } 2772 2773 /* 2774 * Checks related to VM-Entry Control Fields 2775 */ 2776 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2777 struct vmcs12 *vmcs12) 2778 { 2779 struct vcpu_vmx *vmx = to_vmx(vcpu); 2780 2781 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2782 vmx->nested.msrs.entry_ctls_low, 2783 vmx->nested.msrs.entry_ctls_high))) 2784 return -EINVAL; 2785 2786 /* 2787 * From the Intel SDM, volume 3: 2788 * Fields relevant to VM-entry event injection must be set properly. 2789 * These fields are the VM-entry interruption-information field, the 2790 * VM-entry exception error code, and the VM-entry instruction length. 2791 */ 2792 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2793 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2794 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2795 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2796 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2797 bool should_have_error_code; 2798 bool urg = nested_cpu_has2(vmcs12, 2799 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2800 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2801 2802 /* VM-entry interruption-info field: interruption type */ 2803 if (CC(intr_type == INTR_TYPE_RESERVED) || 2804 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2805 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2806 return -EINVAL; 2807 2808 /* VM-entry interruption-info field: vector */ 2809 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2810 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2811 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2812 return -EINVAL; 2813 2814 /* VM-entry interruption-info field: deliver error code */ 2815 should_have_error_code = 2816 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2817 x86_exception_has_error_code(vector); 2818 if (CC(has_error_code != should_have_error_code)) 2819 return -EINVAL; 2820 2821 /* VM-entry exception error code */ 2822 if (CC(has_error_code && 2823 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2824 return -EINVAL; 2825 2826 /* VM-entry interruption-info field: reserved bits */ 2827 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2828 return -EINVAL; 2829 2830 /* VM-entry instruction length */ 2831 switch (intr_type) { 2832 case INTR_TYPE_SOFT_EXCEPTION: 2833 case INTR_TYPE_SOFT_INTR: 2834 case INTR_TYPE_PRIV_SW_EXCEPTION: 2835 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2836 CC(vmcs12->vm_entry_instruction_len == 0 && 2837 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2838 return -EINVAL; 2839 } 2840 } 2841 2842 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 
2843 return -EINVAL; 2844 2845 return 0; 2846 } 2847 2848 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2849 struct vmcs12 *vmcs12) 2850 { 2851 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2852 nested_check_vm_exit_controls(vcpu, vmcs12) || 2853 nested_check_vm_entry_controls(vcpu, vmcs12)) 2854 return -EINVAL; 2855 2856 if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled) 2857 return nested_evmcs_check_controls(vmcs12); 2858 2859 return 0; 2860 } 2861 2862 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 2863 struct vmcs12 *vmcs12) 2864 { 2865 #ifdef CONFIG_X86_64 2866 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 2867 !!(vcpu->arch.efer & EFER_LMA))) 2868 return -EINVAL; 2869 #endif 2870 return 0; 2871 } 2872 2873 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2874 struct vmcs12 *vmcs12) 2875 { 2876 bool ia32e; 2877 2878 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2879 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2880 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) 2881 return -EINVAL; 2882 2883 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2884 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2885 return -EINVAL; 2886 2887 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2888 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2889 return -EINVAL; 2890 2891 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2892 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2893 vmcs12->host_ia32_perf_global_ctrl))) 2894 return -EINVAL; 2895 2896 #ifdef CONFIG_X86_64 2897 ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 2898 #else 2899 ia32e = false; 2900 #endif 2901 2902 if (ia32e) { 2903 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2904 return -EINVAL; 2905 } else { 2906 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2907 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2908 CC((vmcs12->host_rip) >> 32)) 2909 return -EINVAL; 2910 } 2911 2912 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2913 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2914 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2915 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2916 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2917 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2918 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2919 CC(vmcs12->host_cs_selector == 0) || 2920 CC(vmcs12->host_tr_selector == 0) || 2921 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2922 return -EINVAL; 2923 2924 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2925 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2926 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2927 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2928 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2929 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2930 return -EINVAL; 2931 2932 /* 2933 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2934 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2935 * the values of the LMA and LME bits in the field must each be that of 2936 * the host address-space size VM-exit control. 
2937 */ 2938 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2939 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2940 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2941 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2942 return -EINVAL; 2943 } 2944 2945 return 0; 2946 } 2947 2948 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2949 struct vmcs12 *vmcs12) 2950 { 2951 struct vcpu_vmx *vmx = to_vmx(vcpu); 2952 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 2953 struct vmcs_hdr hdr; 2954 2955 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 2956 return 0; 2957 2958 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2959 return -EINVAL; 2960 2961 if (ghc->gpa != vmcs12->vmcs_link_pointer && 2962 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 2963 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 2964 return -EINVAL; 2965 2966 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 2967 offsetof(struct vmcs12, hdr), 2968 sizeof(hdr)))) 2969 return -EINVAL; 2970 2971 if (CC(hdr.revision_id != VMCS12_REVISION) || 2972 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2973 return -EINVAL; 2974 2975 return 0; 2976 } 2977 2978 /* 2979 * Checks related to Guest Non-register State 2980 */ 2981 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2982 { 2983 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2984 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 2985 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 2986 return -EINVAL; 2987 2988 return 0; 2989 } 2990 2991 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 2992 struct vmcs12 *vmcs12, 2993 enum vm_entry_failure_code *entry_failure_code) 2994 { 2995 bool ia32e; 2996 2997 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2998 2999 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3000 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3001 return -EINVAL; 3002 3003 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3004 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 3005 return -EINVAL; 3006 3007 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3008 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3009 return -EINVAL; 3010 3011 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3012 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3013 return -EINVAL; 3014 } 3015 3016 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3017 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3018 vmcs12->guest_ia32_perf_global_ctrl))) 3019 return -EINVAL; 3020 3021 /* 3022 * If the load IA32_EFER VM-entry control is 1, the following checks 3023 * are performed on the field for the IA32_EFER MSR: 3024 * - Bits reserved in the IA32_EFER MSR must be 0. 3025 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3026 * the IA-32e mode guest VM-exit control. It must also be identical 3027 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3028 * CR0.PG) is 1. 
3029 */ 3030 if (to_vmx(vcpu)->nested.nested_run_pending && 3031 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3032 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 3033 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3034 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3035 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3036 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3037 return -EINVAL; 3038 } 3039 3040 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3041 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3042 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3043 return -EINVAL; 3044 3045 if (nested_check_guest_non_reg_state(vmcs12)) 3046 return -EINVAL; 3047 3048 return 0; 3049 } 3050 3051 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3052 { 3053 struct vcpu_vmx *vmx = to_vmx(vcpu); 3054 unsigned long cr3, cr4; 3055 bool vm_fail; 3056 3057 if (!nested_early_check) 3058 return 0; 3059 3060 if (vmx->msr_autoload.host.nr) 3061 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3062 if (vmx->msr_autoload.guest.nr) 3063 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3064 3065 preempt_disable(); 3066 3067 vmx_prepare_switch_to_guest(vcpu); 3068 3069 /* 3070 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3071 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3072 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3073 * there is no need to preserve other bits or save/restore the field. 3074 */ 3075 vmcs_writel(GUEST_RFLAGS, 0); 3076 3077 cr3 = __get_current_cr3_fast(); 3078 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3079 vmcs_writel(HOST_CR3, cr3); 3080 vmx->loaded_vmcs->host_state.cr3 = cr3; 3081 } 3082 3083 cr4 = cr4_read_shadow(); 3084 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3085 vmcs_writel(HOST_CR4, cr4); 3086 vmx->loaded_vmcs->host_state.cr4 = cr4; 3087 } 3088 3089 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3090 vmx->loaded_vmcs->launched); 3091 3092 if (vmx->msr_autoload.host.nr) 3093 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3094 if (vmx->msr_autoload.guest.nr) 3095 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3096 3097 if (vm_fail) { 3098 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3099 3100 preempt_enable(); 3101 3102 trace_kvm_nested_vmenter_failed( 3103 "early hardware check VM-instruction error: ", error); 3104 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3105 return 1; 3106 } 3107 3108 /* 3109 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3110 */ 3111 if (hw_breakpoint_active()) 3112 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3113 local_irq_enable(); 3114 preempt_enable(); 3115 3116 /* 3117 * A non-failing VMEntry means we somehow entered guest mode with 3118 * an illegal RIP, and that's just the tip of the iceberg. There 3119 * is no telling what memory has been modified or what state has 3120 * been exposed to unknown code. Hitting this all but guarantees 3121 * a (very critical) hardware issue. 3122 */ 3123 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3124 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3125 3126 return 0; 3127 } 3128 3129 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3130 { 3131 struct vcpu_vmx *vmx = to_vmx(vcpu); 3132 3133 /* 3134 * hv_evmcs may end up being not mapped after migration (when 3135 * L2 was running), map it here to make sure vmcs12 changes are 3136 * properly reflected. 
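 * In that case hv_evmcs_vmptr is left as EVMPTR_MAP_PENDING by nested
 * state restore and the actual mapping is deferred until this point,
 * when guest memory is known to be accessible again.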
3137 */ 3138 if (vmx->nested.enlightened_vmcs_enabled && 3139 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3140 enum nested_evmptrld_status evmptrld_status = 3141 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3142 3143 if (evmptrld_status == EVMPTRLD_VMFAIL || 3144 evmptrld_status == EVMPTRLD_ERROR) 3145 return false; 3146 3147 /* 3148 * Post migration VMCS12 always provides the most actual 3149 * information, copy it to eVMCS upon entry. 3150 */ 3151 vmx->nested.need_vmcs12_to_shadow_sync = true; 3152 } 3153 3154 return true; 3155 } 3156 3157 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3158 { 3159 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3160 struct vcpu_vmx *vmx = to_vmx(vcpu); 3161 struct kvm_host_map *map; 3162 struct page *page; 3163 u64 hpa; 3164 3165 if (!vcpu->arch.pdptrs_from_userspace && 3166 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3167 /* 3168 * Reload the guest's PDPTRs since after a migration 3169 * the guest CR3 might be restored prior to setting the nested 3170 * state which can lead to a load of wrong PDPTRs. 3171 */ 3172 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3173 return false; 3174 } 3175 3176 3177 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3178 /* 3179 * Translate L1 physical address to host physical 3180 * address for vmcs02. Keep the page pinned, so this 3181 * physical address remains valid. We keep a reference 3182 * to it so we can release it later. 3183 */ 3184 if (vmx->nested.apic_access_page) { /* shouldn't happen */ 3185 kvm_release_page_clean(vmx->nested.apic_access_page); 3186 vmx->nested.apic_access_page = NULL; 3187 } 3188 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); 3189 if (!is_error_page(page)) { 3190 vmx->nested.apic_access_page = page; 3191 hpa = page_to_phys(vmx->nested.apic_access_page); 3192 vmcs_write64(APIC_ACCESS_ADDR, hpa); 3193 } else { 3194 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", 3195 __func__); 3196 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3197 vcpu->run->internal.suberror = 3198 KVM_INTERNAL_ERROR_EMULATION; 3199 vcpu->run->internal.ndata = 0; 3200 return false; 3201 } 3202 } 3203 3204 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3205 map = &vmx->nested.virtual_apic_map; 3206 3207 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3208 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3209 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3210 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3211 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3212 /* 3213 * The processor will never use the TPR shadow, simply 3214 * clear the bit from the execution control. Such a 3215 * configuration is useless, but it happens in tests. 3216 * For any other configuration, failing the vm entry is 3217 * _not_ what the processor does but it's basically the 3218 * only possibility we have. 3219 */ 3220 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3221 } else { 3222 /* 3223 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3224 * force VM-Entry to fail. 
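 * INVALID_GPA is neither page aligned nor a legal physical address, so
 * the resulting VM-Entry fails its control-field checks.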
3225 */ 3226 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3227 } 3228 } 3229 3230 if (nested_cpu_has_posted_intr(vmcs12)) { 3231 map = &vmx->nested.pi_desc_map; 3232 3233 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3234 vmx->nested.pi_desc = 3235 (struct pi_desc *)(((void *)map->hva) + 3236 offset_in_page(vmcs12->posted_intr_desc_addr)); 3237 vmcs_write64(POSTED_INTR_DESC_ADDR, 3238 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3239 } else { 3240 /* 3241 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3242 * access the contents of the VMCS12 posted interrupt 3243 * descriptor. (Note that KVM may do this when it 3244 * should not, per the architectural specification.) 3245 */ 3246 vmx->nested.pi_desc = NULL; 3247 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3248 } 3249 } 3250 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3251 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3252 else 3253 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3254 3255 return true; 3256 } 3257 3258 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3259 { 3260 if (!nested_get_evmcs_page(vcpu)) { 3261 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3262 __func__); 3263 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3264 vcpu->run->internal.suberror = 3265 KVM_INTERNAL_ERROR_EMULATION; 3266 vcpu->run->internal.ndata = 0; 3267 3268 return false; 3269 } 3270 3271 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3272 return false; 3273 3274 return true; 3275 } 3276 3277 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3278 { 3279 struct vmcs12 *vmcs12; 3280 struct vcpu_vmx *vmx = to_vmx(vcpu); 3281 gpa_t dst; 3282 3283 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3284 return 0; 3285 3286 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3287 return 1; 3288 3289 /* 3290 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3291 * set is already checked as part of A/D emulation. 3292 */ 3293 vmcs12 = get_vmcs12(vcpu); 3294 if (!nested_cpu_has_pml(vmcs12)) 3295 return 0; 3296 3297 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3298 vmx->nested.pml_full = true; 3299 return 1; 3300 } 3301 3302 gpa &= ~0xFFFull; 3303 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3304 3305 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3306 offset_in_page(dst), sizeof(gpa))) 3307 return 0; 3308 3309 vmcs12->guest_pml_index--; 3310 3311 return 0; 3312 } 3313 3314 /* 3315 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3316 * for running VMX instructions (except VMXON, whose prerequisites are 3317 * slightly different). It also specifies what exception to inject otherwise. 3318 * Note that many of these exceptions have priority over VM exits, so they 3319 * don't have to be checked again here. 
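 * In practice only two checks remain here: the vCPU must be in VMX
 * operation (otherwise inject #UD) and CPL must be 0 (otherwise inject
 * #GP(0)).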
3320 */ 3321 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3322 { 3323 if (!to_vmx(vcpu)->nested.vmxon) { 3324 kvm_queue_exception(vcpu, UD_VECTOR); 3325 return 0; 3326 } 3327 3328 if (vmx_get_cpl(vcpu)) { 3329 kvm_inject_gp(vcpu, 0); 3330 return 0; 3331 } 3332 3333 return 1; 3334 } 3335 3336 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3337 { 3338 u8 rvi = vmx_get_rvi(); 3339 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3340 3341 return ((rvi & 0xf0) > (vppr & 0xf0)); 3342 } 3343 3344 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3345 struct vmcs12 *vmcs12); 3346 3347 /* 3348 * If from_vmentry is false, this is being called from state restore (either RSM 3349 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 3350 * 3351 * Returns: 3352 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3353 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3354 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3355 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3356 */ 3357 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3358 bool from_vmentry) 3359 { 3360 struct vcpu_vmx *vmx = to_vmx(vcpu); 3361 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3362 enum vm_entry_failure_code entry_failure_code; 3363 bool evaluate_pending_interrupts; 3364 union vmx_exit_reason exit_reason = { 3365 .basic = EXIT_REASON_INVALID_STATE, 3366 .failed_vmentry = 1, 3367 }; 3368 u32 failed_index; 3369 3370 kvm_service_local_tlb_flush_requests(vcpu); 3371 3372 evaluate_pending_interrupts = exec_controls_get(vmx) & 3373 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3374 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3375 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3376 3377 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3378 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3379 if (kvm_mpx_supported() && 3380 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 3381 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3382 3383 /* 3384 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3385 * nested early checks are disabled. In the event of a "late" VM-Fail, 3386 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3387 * software model to the pre-VMEntry host state. When EPT is disabled, 3388 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3389 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3390 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3391 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3392 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3393 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3394 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3395 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3396 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3397 * path would need to manually save/restore vmcs01.GUEST_CR3. 
 */
	if (!enable_ept && !nested_early_check)
		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);

	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);

	prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);

	if (from_vmentry) {
		if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
			return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
		}

		if (nested_vmx_check_vmentry_hw(vcpu)) {
			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
			return NVMX_VMENTRY_VMFAIL;
		}

		if (nested_vmx_check_guest_state(vcpu, vmcs12,
						 &entry_failure_code)) {
			exit_reason.basic = EXIT_REASON_INVALID_STATE;
			vmcs12->exit_qualification = entry_failure_code;
			goto vmentry_fail_vmexit;
		}
	}

	enter_guest_mode(vcpu);

	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
		exit_reason.basic = EXIT_REASON_INVALID_STATE;
		vmcs12->exit_qualification = entry_failure_code;
		goto vmentry_fail_vmexit_guest_mode;
	}

	if (from_vmentry) {
		failed_index = nested_vmx_load_msr(vcpu,
						   vmcs12->vm_entry_msr_load_addr,
						   vmcs12->vm_entry_msr_load_count);
		if (failed_index) {
			exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
			vmcs12->exit_qualification = failed_index;
			goto vmentry_fail_vmexit_guest_mode;
		}
	} else {
		/*
		 * The MMU is not initialized to point at the right entities yet and
		 * "get pages" would need to read data from the guest (i.e. we will
		 * need to perform gpa to hpa translation). Request a call
		 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
		 * have already been set at vmentry time and should not be reset.
		 */
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	}

	/*
	 * If L1 had a pending IRQ/NMI that wasn't delivered before it
	 * executed VMLAUNCH/VMRESUME because delivery was disallowed (e.g.
	 * interrupts disabled), L0 needs to evaluate whether this pending
	 * event should cause an exit from L2 to L1 or be delivered directly
	 * to L2 (e.g. in case L1 doesn't intercept EXTERNAL_INTERRUPT).
	 *
	 * Usually this would be handled by the processor noticing an
	 * IRQ/NMI window request, or checking RVI during evaluation of
	 * pending virtual interrupts. However, this setting was done
	 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
	 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
	 */
	if (unlikely(evaluate_pending_interrupts))
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	/*
	 * Do not start the preemption timer hrtimer until after we know
	 * we are successful, so that only nested_vmx_vmexit needs to cancel
	 * the timer.
	 */
	vmx->nested.preemption_timer_expired = false;
	if (nested_cpu_has_preemption_timer(vmcs12)) {
		u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
		vmx_start_preemption_timer(vcpu, timer_value);
	}

	/*
	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
	 * returned as far as L1 is concerned. It will only return (and set
	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
	 */
	return NVMX_VMENTRY_SUCCESS;

	/*
	 * A failed consistency check that leads to a VMExit during L1's
	 * VMEnter to L2 is a variation of a normal VMexit, as explained in
	 * 26.7 "VM-entry failures during or after loading guest state".
	 */
vmentry_fail_vmexit_guest_mode:
	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
	leave_guest_mode(vcpu);

vmentry_fail_vmexit:
	vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!from_vmentry)
		return NVMX_VMENTRY_VMEXIT;

	load_vmcs12_host_state(vcpu, vmcs12);
	vmcs12->vm_exit_reason = exit_reason.full;
	if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	return NVMX_VMENTRY_VMEXIT;
}

/*
 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
 * for running an L2 nested guest.
 */
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
	struct vmcs12 *vmcs12;
	enum nvmx_vmentry_status status;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
	enum nested_evmptrld_status evmptrld_status;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
	if (evmptrld_status == EVMPTRLD_ERROR) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);

	if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
		return nested_vmx_failInvalid(vcpu);

	if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
	       vmx->nested.current_vmptr == INVALID_GPA))
		return nested_vmx_failInvalid(vcpu);

	vmcs12 = get_vmcs12(vcpu);

	/*
	 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
	 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
	 * rather than RFLAGS.ZF, and no error number is stored to the
	 * VM-instruction error field.
	 */
	if (CC(vmcs12->hdr.shadow_vmcs))
		return nested_vmx_failInvalid(vcpu);

	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
		copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields);
		/* Enlightened VMCS doesn't have launch state */
		vmcs12->launch_state = !launch;
	} else if (enable_shadow_vmcs) {
		copy_shadow_to_vmcs12(vmx);
	}

	/*
	 * The nested entry process starts with enforcing various prerequisites
	 * on vmcs12 as required by the Intel SDM, and acting appropriately when
	 * they fail: as the SDM explains, some conditions should cause the
	 * instruction to fail, while others will cause the instruction to seem
	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
	 * To speed up the normal (success) code path, we should avoid checking
	 * for misconfigurations which will be caught by the processor anyway
	 * when using the merged vmcs02.
	 */
	if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);

	if (CC(vmcs12->launch_state == launch))
		return nested_vmx_fail(vcpu,
			launch ?
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3577 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3578 3579 if (nested_vmx_check_controls(vcpu, vmcs12)) 3580 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3581 3582 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3583 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3584 3585 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3586 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3587 3588 /* 3589 * We're finally done with prerequisite checking, and can start with 3590 * the nested entry. 3591 */ 3592 vmx->nested.nested_run_pending = 1; 3593 vmx->nested.has_preemption_timer_deadline = false; 3594 status = nested_vmx_enter_non_root_mode(vcpu, true); 3595 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3596 goto vmentry_failed; 3597 3598 /* Emulate processing of posted interrupts on VM-Enter. */ 3599 if (nested_cpu_has_posted_intr(vmcs12) && 3600 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3601 vmx->nested.pi_pending = true; 3602 kvm_make_request(KVM_REQ_EVENT, vcpu); 3603 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3604 } 3605 3606 /* Hide L1D cache contents from the nested guest. */ 3607 vmx->vcpu.arch.l1tf_flush_l1d = true; 3608 3609 /* 3610 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3611 * also be used as part of restoring nVMX state for 3612 * snapshot restore (migration). 3613 * 3614 * In this flow, it is assumed that vmcs12 cache was 3615 * transferred as part of captured nVMX state and should 3616 * therefore not be read from guest memory (which may not 3617 * exist on destination host yet). 3618 */ 3619 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3620 3621 switch (vmcs12->guest_activity_state) { 3622 case GUEST_ACTIVITY_HLT: 3623 /* 3624 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3625 * awakened by event injection or by an NMI-window VM-exit or 3626 * by an interrupt-window VM-exit, halt the vcpu. 3627 */ 3628 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3629 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3630 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3631 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3632 vmx->nested.nested_run_pending = 0; 3633 return kvm_emulate_halt_noskip(vcpu); 3634 } 3635 break; 3636 case GUEST_ACTIVITY_WAIT_SIPI: 3637 vmx->nested.nested_run_pending = 0; 3638 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3639 break; 3640 default: 3641 break; 3642 } 3643 3644 return 1; 3645 3646 vmentry_failed: 3647 vmx->nested.nested_run_pending = 0; 3648 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3649 return 0; 3650 if (status == NVMX_VMENTRY_VMEXIT) 3651 return 1; 3652 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3653 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3654 } 3655 3656 /* 3657 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3658 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3659 * This function returns the new value we should put in vmcs12.guest_cr0. 3660 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3661 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3662 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3663 * didn't trap the bit, because if L1 did, so would L0). 3664 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3665 * been modified by L2, and L1 knows it. 
So just leave the old value of 3666 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3667 * isn't relevant, because if L0 traps this bit it can set it to anything. 3668 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3669 * changed these bits, and therefore they need to be updated, but L0 3670 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3671 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3672 */ 3673 static inline unsigned long 3674 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3675 { 3676 return 3677 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3678 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3679 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3680 vcpu->arch.cr0_guest_owned_bits)); 3681 } 3682 3683 static inline unsigned long 3684 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3685 { 3686 return 3687 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3688 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3689 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3690 vcpu->arch.cr4_guest_owned_bits)); 3691 } 3692 3693 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3694 struct vmcs12 *vmcs12, 3695 u32 vm_exit_reason, u32 exit_intr_info) 3696 { 3697 u32 idt_vectoring; 3698 unsigned int nr; 3699 3700 /* 3701 * Per the SDM, VM-Exits due to double and triple faults are never 3702 * considered to occur during event delivery, even if the double/triple 3703 * fault is the result of an escalating vectoring issue. 3704 * 3705 * Note, the SDM qualifies the double fault behavior with "The original 3706 * event results in a double-fault exception". It's unclear why the 3707 * qualification exists since exits due to double fault can occur only 3708 * while vectoring a different exception (injected events are never 3709 * subject to interception), i.e. there's _always_ an original event. 3710 * 3711 * The SDM also uses NMI as a confusing example for the "original event 3712 * causes the VM exit directly" clause. NMI isn't special in any way, 3713 * the same rule applies to all events that cause an exit directly. 3714 * NMI is an odd choice for the example because NMIs can only occur on 3715 * instruction boundaries, i.e. they _can't_ occur during vectoring. 
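 *
 * Accordingly, leave IDT_VECTORING_INFO zeroed for those exits and only
 * synthesize vectoring info for injected exceptions, NMIs and IRQs in
 * the branches below.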
3716 */ 3717 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3718 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3719 is_double_fault(exit_intr_info))) { 3720 vmcs12->idt_vectoring_info_field = 0; 3721 } else if (vcpu->arch.exception.injected) { 3722 nr = vcpu->arch.exception.nr; 3723 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3724 3725 if (kvm_exception_is_soft(nr)) { 3726 vmcs12->vm_exit_instruction_len = 3727 vcpu->arch.event_exit_inst_len; 3728 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3729 } else 3730 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3731 3732 if (vcpu->arch.exception.has_error_code) { 3733 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3734 vmcs12->idt_vectoring_error_code = 3735 vcpu->arch.exception.error_code; 3736 } 3737 3738 vmcs12->idt_vectoring_info_field = idt_vectoring; 3739 } else if (vcpu->arch.nmi_injected) { 3740 vmcs12->idt_vectoring_info_field = 3741 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3742 } else if (vcpu->arch.interrupt.injected) { 3743 nr = vcpu->arch.interrupt.nr; 3744 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3745 3746 if (vcpu->arch.interrupt.soft) { 3747 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3748 vmcs12->vm_entry_instruction_len = 3749 vcpu->arch.event_exit_inst_len; 3750 } else 3751 idt_vectoring |= INTR_TYPE_EXT_INTR; 3752 3753 vmcs12->idt_vectoring_info_field = idt_vectoring; 3754 } else { 3755 vmcs12->idt_vectoring_info_field = 0; 3756 } 3757 } 3758 3759 3760 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3761 { 3762 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3763 gfn_t gfn; 3764 3765 /* 3766 * Don't need to mark the APIC access page dirty; it is never 3767 * written to by the CPU during APIC virtualization. 3768 */ 3769 3770 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3771 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3772 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3773 } 3774 3775 if (nested_cpu_has_posted_intr(vmcs12)) { 3776 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3777 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3778 } 3779 } 3780 3781 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3782 { 3783 struct vcpu_vmx *vmx = to_vmx(vcpu); 3784 int max_irr; 3785 void *vapic_page; 3786 u16 status; 3787 3788 if (!vmx->nested.pi_pending) 3789 return 0; 3790 3791 if (!vmx->nested.pi_desc) 3792 goto mmio_needed; 3793 3794 vmx->nested.pi_pending = false; 3795 3796 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3797 return 0; 3798 3799 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3800 if (max_irr != 256) { 3801 vapic_page = vmx->nested.virtual_apic_map.hva; 3802 if (!vapic_page) 3803 goto mmio_needed; 3804 3805 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3806 vapic_page, &max_irr); 3807 status = vmcs_read16(GUEST_INTR_STATUS); 3808 if ((u8)max_irr > ((u8)status & 0xff)) { 3809 status &= ~0xff; 3810 status |= (u8)max_irr; 3811 vmcs_write16(GUEST_INTR_STATUS, status); 3812 } 3813 } 3814 3815 nested_mark_vmcs12_pages_dirty(vcpu); 3816 return 0; 3817 3818 mmio_needed: 3819 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3820 return -ENXIO; 3821 } 3822 3823 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3824 unsigned long exit_qual) 3825 { 3826 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3827 unsigned int nr = vcpu->arch.exception.nr; 3828 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3829 3830 if (vcpu->arch.exception.has_error_code) { 3831 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3832 
intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3833 } 3834 3835 if (kvm_exception_is_soft(nr)) 3836 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3837 else 3838 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3839 3840 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3841 vmx_get_nmi_mask(vcpu)) 3842 intr_info |= INTR_INFO_UNBLOCK_NMI; 3843 3844 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3845 } 3846 3847 /* 3848 * Returns true if a debug trap is pending delivery. 3849 * 3850 * In KVM, debug traps bear an exception payload. As such, the class of a #DB 3851 * exception may be inferred from the presence of an exception payload. 3852 */ 3853 static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) 3854 { 3855 return vcpu->arch.exception.pending && 3856 vcpu->arch.exception.nr == DB_VECTOR && 3857 vcpu->arch.exception.payload; 3858 } 3859 3860 /* 3861 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 3862 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 3863 * represents these debug traps with a payload that is said to be compatible 3864 * with the 'pending debug exceptions' field, write the payload to the VMCS 3865 * field if a VM-exit is delivered before the debug trap. 3866 */ 3867 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 3868 { 3869 if (vmx_pending_dbg_trap(vcpu)) 3870 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 3871 vcpu->arch.exception.payload); 3872 } 3873 3874 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 3875 { 3876 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3877 to_vmx(vcpu)->nested.preemption_timer_expired; 3878 } 3879 3880 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 3881 { 3882 struct vcpu_vmx *vmx = to_vmx(vcpu); 3883 unsigned long exit_qual; 3884 bool block_nested_events = 3885 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3886 bool mtf_pending = vmx->nested.mtf_pending; 3887 struct kvm_lapic *apic = vcpu->arch.apic; 3888 3889 /* 3890 * Clear the MTF state. If a higher priority VM-exit is delivered first, 3891 * this state is discarded. 3892 */ 3893 if (!block_nested_events) 3894 vmx->nested.mtf_pending = false; 3895 3896 if (lapic_in_kernel(vcpu) && 3897 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3898 if (block_nested_events) 3899 return -EBUSY; 3900 nested_vmx_update_pending_dbg(vcpu); 3901 clear_bit(KVM_APIC_INIT, &apic->pending_events); 3902 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 3903 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3904 return 0; 3905 } 3906 3907 if (lapic_in_kernel(vcpu) && 3908 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 3909 if (block_nested_events) 3910 return -EBUSY; 3911 3912 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 3913 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 3914 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 3915 apic->sipi_vector & 0xFFUL); 3916 return 0; 3917 } 3918 3919 /* 3920 * Process any exceptions that are not debug traps before MTF. 3921 * 3922 * Note that only a pending nested run can block a pending exception. 3923 * Otherwise an injected NMI/interrupt should either be 3924 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO, 3925 * while delivering the pending exception. 
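 *
 * Debug traps, by contrast, are handled only after the MTF check below;
 * if a higher-priority VM-exit is delivered first, the trap's payload is
 * written to the 'pending debug exceptions' field via
 * nested_vmx_update_pending_dbg().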
3926 */ 3927 3928 if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { 3929 if (vmx->nested.nested_run_pending) 3930 return -EBUSY; 3931 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3932 goto no_vmexit; 3933 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3934 return 0; 3935 } 3936 3937 if (mtf_pending) { 3938 if (block_nested_events) 3939 return -EBUSY; 3940 nested_vmx_update_pending_dbg(vcpu); 3941 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 3942 return 0; 3943 } 3944 3945 if (vcpu->arch.exception.pending) { 3946 if (vmx->nested.nested_run_pending) 3947 return -EBUSY; 3948 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3949 goto no_vmexit; 3950 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3951 return 0; 3952 } 3953 3954 if (nested_vmx_preemption_timer_pending(vcpu)) { 3955 if (block_nested_events) 3956 return -EBUSY; 3957 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3958 return 0; 3959 } 3960 3961 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 3962 if (block_nested_events) 3963 return -EBUSY; 3964 goto no_vmexit; 3965 } 3966 3967 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 3968 if (block_nested_events) 3969 return -EBUSY; 3970 if (!nested_exit_on_nmi(vcpu)) 3971 goto no_vmexit; 3972 3973 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3974 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3975 INTR_INFO_VALID_MASK, 0); 3976 /* 3977 * The NMI-triggered VM exit counts as injection: 3978 * clear this one and block further NMIs. 3979 */ 3980 vcpu->arch.nmi_pending = 0; 3981 vmx_set_nmi_mask(vcpu, true); 3982 return 0; 3983 } 3984 3985 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 3986 if (block_nested_events) 3987 return -EBUSY; 3988 if (!nested_exit_on_intr(vcpu)) 3989 goto no_vmexit; 3990 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3991 return 0; 3992 } 3993 3994 no_vmexit: 3995 return vmx_complete_nested_posted_interrupt(vcpu); 3996 } 3997 3998 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3999 { 4000 ktime_t remaining = 4001 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4002 u64 value; 4003 4004 if (ktime_to_ns(remaining) <= 0) 4005 return 0; 4006 4007 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4008 do_div(value, 1000000); 4009 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4010 } 4011 4012 static bool is_vmcs12_ext_field(unsigned long field) 4013 { 4014 switch (field) { 4015 case GUEST_ES_SELECTOR: 4016 case GUEST_CS_SELECTOR: 4017 case GUEST_SS_SELECTOR: 4018 case GUEST_DS_SELECTOR: 4019 case GUEST_FS_SELECTOR: 4020 case GUEST_GS_SELECTOR: 4021 case GUEST_LDTR_SELECTOR: 4022 case GUEST_TR_SELECTOR: 4023 case GUEST_ES_LIMIT: 4024 case GUEST_CS_LIMIT: 4025 case GUEST_SS_LIMIT: 4026 case GUEST_DS_LIMIT: 4027 case GUEST_FS_LIMIT: 4028 case GUEST_GS_LIMIT: 4029 case GUEST_LDTR_LIMIT: 4030 case GUEST_TR_LIMIT: 4031 case GUEST_GDTR_LIMIT: 4032 case GUEST_IDTR_LIMIT: 4033 case GUEST_ES_AR_BYTES: 4034 case GUEST_DS_AR_BYTES: 4035 case GUEST_FS_AR_BYTES: 4036 case GUEST_GS_AR_BYTES: 4037 case GUEST_LDTR_AR_BYTES: 4038 case GUEST_TR_AR_BYTES: 4039 case GUEST_ES_BASE: 4040 case GUEST_CS_BASE: 4041 case GUEST_SS_BASE: 4042 case GUEST_DS_BASE: 4043 case GUEST_FS_BASE: 4044 case GUEST_GS_BASE: 4045 case GUEST_LDTR_BASE: 4046 case GUEST_TR_BASE: 4047 case GUEST_GDTR_BASE: 4048 case GUEST_IDTR_BASE: 4049 case GUEST_PENDING_DBG_EXCEPTIONS: 4050 case GUEST_BNDCFGS: 4051 return true; 4052 default: 4053 break; 4054 } 4055 4056 return 
false; 4057 } 4058 4059 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4060 struct vmcs12 *vmcs12) 4061 { 4062 struct vcpu_vmx *vmx = to_vmx(vcpu); 4063 4064 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4065 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4066 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4067 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4068 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4069 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4070 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4071 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4072 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4073 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4074 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4075 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4076 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4077 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4078 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4079 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4080 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4081 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4082 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4083 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4084 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4085 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4086 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4087 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4088 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4089 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4090 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4091 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4092 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4093 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4094 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4095 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4096 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4097 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4098 vmcs12->guest_pending_dbg_exceptions = 4099 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4100 if (kvm_mpx_supported()) 4101 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 4102 4103 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4104 } 4105 4106 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4107 struct vmcs12 *vmcs12) 4108 { 4109 struct vcpu_vmx *vmx = to_vmx(vcpu); 4110 int cpu; 4111 4112 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4113 return; 4114 4115 4116 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4117 4118 cpu = get_cpu(); 4119 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4120 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4121 4122 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4123 4124 vmx->loaded_vmcs = &vmx->vmcs01; 4125 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4126 put_cpu(); 4127 } 4128 4129 /* 4130 * Update the guest state fields of vmcs12 to reflect changes that 4131 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4132 * VM-entry controls is also updated, since this is really a guest 4133 * state bit.) 
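 *
 * Rarely-accessed guest fields are synced by sync_vmcs02_to_vmcs12_rare();
 * when no enlightened VMCS is in use their sync is deferred until the
 * fields are actually needed, e.g. on a VMREAD from L1 (see
 * copy_vmcs02_to_vmcs12_rare()).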
4134 */ 4135 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4136 { 4137 struct vcpu_vmx *vmx = to_vmx(vcpu); 4138 4139 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 4140 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4141 4142 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4143 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); 4144 4145 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4146 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4147 4148 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4149 vmcs12->guest_rip = kvm_rip_read(vcpu); 4150 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4151 4152 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4153 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4154 4155 vmcs12->guest_interruptibility_info = 4156 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4157 4158 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4159 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4160 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4161 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4162 else 4163 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4164 4165 if (nested_cpu_has_preemption_timer(vmcs12) && 4166 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4167 !vmx->nested.nested_run_pending) 4168 vmcs12->vmx_preemption_timer_value = 4169 vmx_get_preemption_timer_value(vcpu); 4170 4171 /* 4172 * In some cases (usually, nested EPT), L2 is allowed to change its 4173 * own CR3 without exiting. If it has changed it, we must keep it. 4174 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4175 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4176 * 4177 * Additionally, restore L2's PDPTR to vmcs12. 4178 */ 4179 if (enable_ept) { 4180 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4181 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4182 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4183 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4184 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4185 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4186 } 4187 } 4188 4189 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4190 4191 if (nested_cpu_has_vid(vmcs12)) 4192 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4193 4194 vmcs12->vm_entry_controls = 4195 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4196 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4197 4198 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4199 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 4200 4201 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4202 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4203 } 4204 4205 /* 4206 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4207 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4208 * and this function updates it to reflect the changes to the guest state while 4209 * L2 was running (and perhaps made some exits which were handled directly by L0 4210 * without going back to L1), and to reflect the exit reason. 4211 * Note that we do not have to copy here all VMCS fields, just those that 4212 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4213 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4214 * which already writes to vmcs12 directly. 
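 *
 * The host-state fields of vmcs12 are not touched here; loading L1's host
 * state into the vCPU is done separately by load_vmcs12_host_state().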
 */
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			   u32 vm_exit_reason, u32 exit_intr_info,
			   unsigned long exit_qualification)
{
	/* update exit information fields: */
	vmcs12->vm_exit_reason = vm_exit_reason;
	if (to_vmx(vcpu)->exit_reason.enclave_mode)
		vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
	vmcs12->exit_qualification = exit_qualification;

	/*
	 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
	 * and only EXIT_REASON and EXIT_QUALIFICATION are updated; all other
	 * exit info fields are unmodified.
	 */
	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
		vmcs12->launch_state = 1;

		/* vm_entry_intr_info_field is cleared on exit. Emulate this
		 * instead of reading the real value. */
		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;

		/*
		 * Transfer the event that L0 or L1 may have wanted to inject
		 * into L2 to IDT_VECTORING_INFO_FIELD.
		 */
		vmcs12_save_pending_event(vcpu, vmcs12,
					  vm_exit_reason, exit_intr_info);

		vmcs12->vm_exit_intr_info = exit_intr_info;
		vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
		vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);

		/*
		 * According to spec, there's no need to store the guest's
		 * MSRs if the exit is due to a VM-entry failure that occurs
		 * during or after loading the guest state. Since this exit
		 * does not fall in that category, we need to save the MSRs.
		 */
		if (nested_vmx_store_msr(vcpu,
					 vmcs12->vm_exit_msr_store_addr,
					 vmcs12->vm_exit_msr_store_count))
			nested_vmx_abort(vcpu,
					 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
	}

	/*
	 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
	 * preserved above and would only end up incorrectly in L1.
	 */
	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);
}

/*
 * A part of what we need to do when the nested L2 guest exits and we want to
 * run its L1 parent is to reset L1's guest state to the host state specified
 * in vmcs12.
 * This function is to be called not only on a normal nested exit, but also on
 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
 * Failures During or After Loading Guest State").
 * This function should be called when the active VMCS is L1's (vmcs01).
 */
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12)
{
	enum vm_entry_failure_code ignored;
	struct kvm_segment seg;

	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
		vcpu->arch.efer = vmcs12->host_ia32_efer;
	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
	else
		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
	vmx_set_efer(vcpu, vcpu->arch.efer);

	kvm_rsp_write(vcpu, vmcs12->host_rsp);
	kvm_rip_write(vcpu, vmcs12->host_rip);
	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
	vmx_set_interrupt_shadow(vcpu, 0);

	/*
	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
	 * actually changed, because vmx_set_cr0 refers to efer set above.
4302 * 4303 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4304 * (KVM doesn't change it); 4305 */ 4306 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4307 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4308 4309 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 4310 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4311 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4312 4313 nested_ept_uninit_mmu_context(vcpu); 4314 4315 /* 4316 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4317 * couldn't have changed. 4318 */ 4319 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4320 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4321 4322 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4323 4324 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4325 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4326 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4327 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4328 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4329 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4330 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4331 4332 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4333 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4334 vmcs_write64(GUEST_BNDCFGS, 0); 4335 4336 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4337 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4338 vcpu->arch.pat = vmcs12->host_ia32_pat; 4339 } 4340 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 4341 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4342 vmcs12->host_ia32_perf_global_ctrl)); 4343 4344 /* Set L1 segment info according to Intel SDM 4345 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4346 seg = (struct kvm_segment) { 4347 .base = 0, 4348 .limit = 0xFFFFFFFF, 4349 .selector = vmcs12->host_cs_selector, 4350 .type = 11, 4351 .present = 1, 4352 .s = 1, 4353 .g = 1 4354 }; 4355 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4356 seg.l = 1; 4357 else 4358 seg.db = 1; 4359 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4360 seg = (struct kvm_segment) { 4361 .base = 0, 4362 .limit = 0xFFFFFFFF, 4363 .type = 3, 4364 .present = 1, 4365 .s = 1, 4366 .db = 1, 4367 .g = 1 4368 }; 4369 seg.selector = vmcs12->host_ds_selector; 4370 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4371 seg.selector = vmcs12->host_es_selector; 4372 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4373 seg.selector = vmcs12->host_ss_selector; 4374 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4375 seg.selector = vmcs12->host_fs_selector; 4376 seg.base = vmcs12->host_fs_base; 4377 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4378 seg.selector = vmcs12->host_gs_selector; 4379 seg.base = vmcs12->host_gs_base; 4380 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4381 seg = (struct kvm_segment) { 4382 .base = vmcs12->host_tr_base, 4383 .limit = 0x67, 4384 .selector = vmcs12->host_tr_selector, 4385 .type = 11, 4386 .present = 1 4387 }; 4388 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4389 4390 memset(&seg, 0, sizeof(seg)); 4391 seg.unusable = 1; 4392 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4393 4394 kvm_set_dr(vcpu, 7, 0x400); 4395 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4396 4397 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4398 vmcs12->vm_exit_msr_load_count)) 4399 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4400 4401 
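	/*
	 * The segment and control register state loaded above changes what
	 * vmx_emulation_required() reports, so recompute it before resuming L1.
	 */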
to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4402 } 4403 4404 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4405 { 4406 struct vmx_uret_msr *efer_msr; 4407 unsigned int i; 4408 4409 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4410 return vmcs_read64(GUEST_IA32_EFER); 4411 4412 if (cpu_has_load_ia32_efer()) 4413 return host_efer; 4414 4415 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4416 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4417 return vmx->msr_autoload.guest.val[i].value; 4418 } 4419 4420 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4421 if (efer_msr) 4422 return efer_msr->data; 4423 4424 return host_efer; 4425 } 4426 4427 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4428 { 4429 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4430 struct vcpu_vmx *vmx = to_vmx(vcpu); 4431 struct vmx_msr_entry g, h; 4432 gpa_t gpa; 4433 u32 i, j; 4434 4435 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4436 4437 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4438 /* 4439 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4440 * as vmcs01.GUEST_DR7 contains a userspace defined value 4441 * and vcpu->arch.dr7 is not squirreled away before the 4442 * nested VMENTER (not worth adding a variable in nested_vmx). 4443 */ 4444 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4445 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4446 else 4447 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4448 } 4449 4450 /* 4451 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4452 * handle a variety of side effects to KVM's software model. 4453 */ 4454 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4455 4456 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4457 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4458 4459 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4460 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4461 4462 nested_ept_uninit_mmu_context(vcpu); 4463 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4464 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4465 4466 /* 4467 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4468 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4469 * VMFail, like everything else we just need to ensure our 4470 * software model is up-to-date. 4471 */ 4472 if (enable_ept && is_pae_paging(vcpu)) 4473 ept_save_pdptrs(vcpu); 4474 4475 kvm_mmu_reset_context(vcpu); 4476 4477 /* 4478 * This nasty bit of open coding is a compromise between blindly 4479 * loading L1's MSRs using the exit load lists (incorrect emulation 4480 * of VMFail), leaving the nested VM's MSRs in the software model 4481 * (incorrect behavior) and snapshotting the modified MSRs (too 4482 * expensive since the lists are unbound by hardware). For each 4483 * MSR that was (prematurely) loaded from the nested VMEntry load 4484 * list, reload it from the exit load list if it exists and differs 4485 * from the guest value. The intent is to stuff host state as 4486 * silently as possible, not to fully process the exit load list. 
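 *
 * E.g. if the VM-entry load list loaded IA32_PAT and the VM-exit load
 * list also contains IA32_PAT with a different value, the exit value is
 * written back; MSRs that aren't in the exit load list are left as-is.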
4487 */ 4488 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4489 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4490 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4491 pr_debug_ratelimited( 4492 "%s read MSR index failed (%u, 0x%08llx)\n", 4493 __func__, i, gpa); 4494 goto vmabort; 4495 } 4496 4497 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4498 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4499 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4500 pr_debug_ratelimited( 4501 "%s read MSR failed (%u, 0x%08llx)\n", 4502 __func__, j, gpa); 4503 goto vmabort; 4504 } 4505 if (h.index != g.index) 4506 continue; 4507 if (h.value == g.value) 4508 break; 4509 4510 if (nested_vmx_load_msr_check(vcpu, &h)) { 4511 pr_debug_ratelimited( 4512 "%s check failed (%u, 0x%x, 0x%x)\n", 4513 __func__, j, h.index, h.reserved); 4514 goto vmabort; 4515 } 4516 4517 if (kvm_set_msr(vcpu, h.index, h.value)) { 4518 pr_debug_ratelimited( 4519 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4520 __func__, j, h.index, h.value); 4521 goto vmabort; 4522 } 4523 } 4524 } 4525 4526 return; 4527 4528 vmabort: 4529 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4530 } 4531 4532 /* 4533 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4534 * and modify vmcs12 to make it see what it would expect to see there if 4535 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4536 */ 4537 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4538 u32 exit_intr_info, unsigned long exit_qualification) 4539 { 4540 struct vcpu_vmx *vmx = to_vmx(vcpu); 4541 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4542 4543 /* trying to cancel vmlaunch/vmresume is a bug */ 4544 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4545 4546 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4547 /* 4548 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4549 * Enlightened VMCS after migration and we still need to 4550 * do that when something is forcing L2->L1 exit prior to 4551 * the first L2 run. 4552 */ 4553 (void)nested_get_evmcs_page(vcpu); 4554 } 4555 4556 /* Service pending TLB flush requests for L2 before switching to L1. */ 4557 kvm_service_local_tlb_flush_requests(vcpu); 4558 4559 /* 4560 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4561 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4562 * up-to-date before switching to L1. 4563 */ 4564 if (enable_ept && is_pae_paging(vcpu)) 4565 vmx_ept_load_pdptrs(vcpu); 4566 4567 leave_guest_mode(vcpu); 4568 4569 if (nested_cpu_has_preemption_timer(vmcs12)) 4570 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4571 4572 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4573 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4574 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4575 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4576 } 4577 4578 if (likely(!vmx->fail)) { 4579 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4580 4581 if (vm_exit_reason != -1) 4582 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4583 exit_intr_info, exit_qualification); 4584 4585 /* 4586 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4587 * also be used to capture vmcs12 cache as part of 4588 * capturing nVMX state for snapshot (migration). 4589 * 4590 * Otherwise, this flush will dirty guest memory at a 4591 * point it is already assumed by user-space to be 4592 * immutable. 
4593 */ 4594 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4595 } else { 4596 /* 4597 * The only expected VM-instruction error is "VM entry with 4598 * invalid control field(s)." Anything else indicates a 4599 * problem with L0. And we should never get here with a 4600 * VMFail of any type if early consistency checks are enabled. 4601 */ 4602 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4603 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4604 WARN_ON_ONCE(nested_early_check); 4605 } 4606 4607 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4608 4609 /* Update any VMCS fields that might have changed while L2 ran */ 4610 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4611 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4612 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4613 if (kvm_has_tsc_control) 4614 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 4615 4616 if (vmx->nested.l1_tpr_threshold != -1) 4617 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4618 4619 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4620 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4621 vmx_set_virtual_apic_mode(vcpu); 4622 } 4623 4624 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4625 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4626 vmx_update_cpu_dirty_logging(vcpu); 4627 } 4628 4629 /* Unpin physical memory we referred to in vmcs02 */ 4630 if (vmx->nested.apic_access_page) { 4631 kvm_release_page_clean(vmx->nested.apic_access_page); 4632 vmx->nested.apic_access_page = NULL; 4633 } 4634 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4635 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4636 vmx->nested.pi_desc = NULL; 4637 4638 if (vmx->nested.reload_vmcs01_apic_access_page) { 4639 vmx->nested.reload_vmcs01_apic_access_page = false; 4640 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4641 } 4642 4643 if (vmx->nested.update_vmcs01_apicv_status) { 4644 vmx->nested.update_vmcs01_apicv_status = false; 4645 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 4646 } 4647 4648 if ((vm_exit_reason != -1) && 4649 (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) 4650 vmx->nested.need_vmcs12_to_shadow_sync = true; 4651 4652 /* in case we halted in L2 */ 4653 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4654 4655 if (likely(!vmx->fail)) { 4656 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4657 nested_exit_intr_ack_set(vcpu)) { 4658 int irq = kvm_cpu_get_interrupt(vcpu); 4659 WARN_ON(irq < 0); 4660 vmcs12->vm_exit_intr_info = irq | 4661 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4662 } 4663 4664 if (vm_exit_reason != -1) 4665 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4666 vmcs12->exit_qualification, 4667 vmcs12->idt_vectoring_info_field, 4668 vmcs12->vm_exit_intr_info, 4669 vmcs12->vm_exit_intr_error_code, 4670 KVM_ISA_VMX); 4671 4672 load_vmcs12_host_state(vcpu, vmcs12); 4673 4674 return; 4675 } 4676 4677 /* 4678 * After an early L2 VM-entry failure, we're now back 4679 * in L1 which thinks it just finished a VMLAUNCH or 4680 * VMRESUME instruction, so we need to set the failure 4681 * flag and the VM-instruction error field of the VMCS 4682 * accordingly, and skip the emulated instruction. 4683 */ 4684 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4685 4686 /* 4687 * Restore L1's host state to KVM's software model. 
We're here 4688 * because a consistency check was caught by hardware, which 4689 * means some amount of guest state has been propagated to KVM's 4690 * model and needs to be unwound to the host's state. 4691 */ 4692 nested_vmx_restore_host_state(vcpu); 4693 4694 vmx->fail = 0; 4695 } 4696 4697 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4698 { 4699 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4700 } 4701 4702 /* 4703 * Decode the memory-address operand of a vmx instruction, as recorded on an 4704 * exit caused by such an instruction (run by a guest hypervisor). 4705 * On success, returns 0. When the operand is invalid, returns 1 and throws 4706 * #UD, #GP, or #SS. 4707 */ 4708 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4709 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4710 { 4711 gva_t off; 4712 bool exn; 4713 struct kvm_segment s; 4714 4715 /* 4716 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4717 * Execution", on an exit, vmx_instruction_info holds most of the 4718 * addressing components of the operand. Only the displacement part 4719 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4720 * For how an actual address is calculated from all these components, 4721 * refer to Vol. 1, "Operand Addressing". 4722 */ 4723 int scaling = vmx_instruction_info & 3; 4724 int addr_size = (vmx_instruction_info >> 7) & 7; 4725 bool is_reg = vmx_instruction_info & (1u << 10); 4726 int seg_reg = (vmx_instruction_info >> 15) & 7; 4727 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4728 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4729 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4730 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4731 4732 if (is_reg) { 4733 kvm_queue_exception(vcpu, UD_VECTOR); 4734 return 1; 4735 } 4736 4737 /* Addr = segment_base + offset */ 4738 /* offset = base + [index * scale] + displacement */ 4739 off = exit_qualification; /* holds the displacement */ 4740 if (addr_size == 1) 4741 off = (gva_t)sign_extend64(off, 31); 4742 else if (addr_size == 0) 4743 off = (gva_t)sign_extend64(off, 15); 4744 if (base_is_valid) 4745 off += kvm_register_read(vcpu, base_reg); 4746 if (index_is_valid) 4747 off += kvm_register_read(vcpu, index_reg) << scaling; 4748 vmx_get_segment(vcpu, &s, seg_reg); 4749 4750 /* 4751 * The effective address, i.e. @off, of a memory operand is truncated 4752 * based on the address size of the instruction. Note that this is 4753 * the *effective address*, i.e. the address prior to accounting for 4754 * the segment's base. 4755 */ 4756 if (addr_size == 1) /* 32 bit */ 4757 off &= 0xffffffff; 4758 else if (addr_size == 0) /* 16 bit */ 4759 off &= 0xffff; 4760 4761 /* Checks for #GP/#SS exceptions. */ 4762 exn = false; 4763 if (is_long_mode(vcpu)) { 4764 /* 4765 * The virtual/linear address is never truncated in 64-bit 4766 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4767 * address when using FS/GS with a non-zero base. 4768 */ 4769 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4770 *ret = s.base + off; 4771 else 4772 *ret = off; 4773 4774 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4775 * non-canonical form. This is the only check on the memory 4776 * destination for long mode! 
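 *
 * Note that for FS/GS the segment base has already been added above,
 * so the canonical check covers the final linear address.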
4777 */ 4778 exn = is_noncanonical_address(*ret, vcpu); 4779 } else { 4780 /* 4781 * When not in long mode, the virtual/linear address is 4782 * unconditionally truncated to 32 bits regardless of the 4783 * address size. 4784 */ 4785 *ret = (s.base + off) & 0xffffffff; 4786 4787 /* Protected mode: apply checks for segment validity in the 4788 * following order: 4789 * - segment type check (#GP(0) may be thrown) 4790 * - usability check (#GP(0)/#SS(0)) 4791 * - limit check (#GP(0)/#SS(0)) 4792 */ 4793 if (wr) 4794 /* #GP(0) if the destination operand is located in a 4795 * read-only data segment or any code segment. 4796 */ 4797 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4798 else 4799 /* #GP(0) if the source operand is located in an 4800 * execute-only code segment 4801 */ 4802 exn = ((s.type & 0xa) == 8); 4803 if (exn) { 4804 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4805 return 1; 4806 } 4807 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4808 */ 4809 exn = (s.unusable != 0); 4810 4811 /* 4812 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4813 * outside the segment limit. All CPUs that support VMX ignore 4814 * limit checks for flat segments, i.e. segments with base==0, 4815 * limit==0xffffffff and of type expand-up data or code. 4816 */ 4817 if (!(s.base == 0 && s.limit == 0xffffffff && 4818 ((s.type & 8) || !(s.type & 4)))) 4819 exn = exn || ((u64)off + len - 1 > s.limit); 4820 } 4821 if (exn) { 4822 kvm_queue_exception_e(vcpu, 4823 seg_reg == VCPU_SREG_SS ? 4824 SS_VECTOR : GP_VECTOR, 4825 0); 4826 return 1; 4827 } 4828 4829 return 0; 4830 } 4831 4832 void nested_vmx_pmu_refresh(struct kvm_vcpu *vcpu, 4833 bool vcpu_has_perf_global_ctrl) 4834 { 4835 struct vcpu_vmx *vmx; 4836 4837 if (!nested_vmx_allowed(vcpu)) 4838 return; 4839 4840 vmx = to_vmx(vcpu); 4841 if (vcpu_has_perf_global_ctrl) { 4842 vmx->nested.msrs.entry_ctls_high |= 4843 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4844 vmx->nested.msrs.exit_ctls_high |= 4845 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4846 } else { 4847 vmx->nested.msrs.entry_ctls_high &= 4848 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4849 vmx->nested.msrs.exit_ctls_high &= 4850 ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4851 } 4852 } 4853 4854 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 4855 int *ret) 4856 { 4857 gva_t gva; 4858 struct x86_exception e; 4859 int r; 4860 4861 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 4862 vmcs_read32(VMX_INSTRUCTION_INFO), false, 4863 sizeof(*vmpointer), &gva)) { 4864 *ret = 1; 4865 return -EINVAL; 4866 } 4867 4868 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 4869 if (r != X86EMUL_CONTINUE) { 4870 *ret = kvm_handle_memory_failure(vcpu, r, &e); 4871 return -EINVAL; 4872 } 4873 4874 return 0; 4875 } 4876 4877 /* 4878 * Allocate a shadow VMCS and associate it with the currently loaded 4879 * VMCS, unless such a shadow VMCS already exists. The newly allocated 4880 * VMCS is also VMCLEARed, so that it is ready for use. 4881 */ 4882 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 4883 { 4884 struct vcpu_vmx *vmx = to_vmx(vcpu); 4885 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 4886 4887 /* 4888 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 4889 * when L1 executes VMXOFF or the vCPU is forced out of nested 4890 * operation. VMXON faults if the CPU is already post-VMXON, so it 4891 * should be impossible to already have an allocated shadow VMCS. 
KVM 4892 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 4893 * always be the loaded VMCS. 4894 */ 4895 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 4896 return loaded_vmcs->shadow_vmcs; 4897 4898 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 4899 if (loaded_vmcs->shadow_vmcs) 4900 vmcs_clear(loaded_vmcs->shadow_vmcs); 4901 4902 return loaded_vmcs->shadow_vmcs; 4903 } 4904 4905 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 4906 { 4907 struct vcpu_vmx *vmx = to_vmx(vcpu); 4908 int r; 4909 4910 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 4911 if (r < 0) 4912 goto out_vmcs02; 4913 4914 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4915 if (!vmx->nested.cached_vmcs12) 4916 goto out_cached_vmcs12; 4917 4918 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 4919 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4920 if (!vmx->nested.cached_shadow_vmcs12) 4921 goto out_cached_shadow_vmcs12; 4922 4923 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 4924 goto out_shadow_vmcs; 4925 4926 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 4927 HRTIMER_MODE_ABS_PINNED); 4928 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 4929 4930 vmx->nested.vpid02 = allocate_vpid(); 4931 4932 vmx->nested.vmcs02_initialized = false; 4933 vmx->nested.vmxon = true; 4934 4935 if (vmx_pt_mode_is_host_guest()) { 4936 vmx->pt_desc.guest.ctl = 0; 4937 pt_update_intercept_for_msr(vcpu); 4938 } 4939 4940 return 0; 4941 4942 out_shadow_vmcs: 4943 kfree(vmx->nested.cached_shadow_vmcs12); 4944 4945 out_cached_shadow_vmcs12: 4946 kfree(vmx->nested.cached_vmcs12); 4947 4948 out_cached_vmcs12: 4949 free_loaded_vmcs(&vmx->nested.vmcs02); 4950 4951 out_vmcs02: 4952 return -ENOMEM; 4953 } 4954 4955 /* Emulate the VMXON instruction. */ 4956 static int handle_vmon(struct kvm_vcpu *vcpu) 4957 { 4958 int ret; 4959 gpa_t vmptr; 4960 uint32_t revision; 4961 struct vcpu_vmx *vmx = to_vmx(vcpu); 4962 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 4963 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 4964 4965 /* 4966 * The Intel VMX Instruction Reference lists a bunch of bits that are 4967 * prerequisite to running VMXON, most notably cr4.VMXE must be set to 4968 * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this). 4969 * Otherwise, we should fail with #UD. But most faulting conditions 4970 * have already been checked by hardware, prior to the VM-exit for 4971 * VMXON. We do test guest cr4.VMXE because processor CR4 always has 4972 * that bit set to 1 in non-root mode. 4973 */ 4974 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { 4975 kvm_queue_exception(vcpu, UD_VECTOR); 4976 return 1; 4977 } 4978 4979 /* CPL=0 must be checked manually. 
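 * Hardware doesn't apply the CPL > 0 check before delivering the VMXON
 * VM-exit from VMX non-root operation, so emulate the #GP(0) here.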
*/ 4980 if (vmx_get_cpl(vcpu)) { 4981 kvm_inject_gp(vcpu, 0); 4982 return 1; 4983 } 4984 4985 if (vmx->nested.vmxon) 4986 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 4987 4988 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 4989 != VMXON_NEEDED_FEATURES) { 4990 kvm_inject_gp(vcpu, 0); 4991 return 1; 4992 } 4993 4994 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 4995 return ret; 4996 4997 /* 4998 * SDM 3: 24.11.5 4999 * The first 4 bytes of VMXON region contain the supported 5000 * VMCS revision identifier 5001 * 5002 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5003 * which replaces physical address width with 32 5004 */ 5005 if (!page_address_valid(vcpu, vmptr)) 5006 return nested_vmx_failInvalid(vcpu); 5007 5008 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5009 revision != VMCS12_REVISION) 5010 return nested_vmx_failInvalid(vcpu); 5011 5012 vmx->nested.vmxon_ptr = vmptr; 5013 ret = enter_vmx_operation(vcpu); 5014 if (ret) 5015 return ret; 5016 5017 return nested_vmx_succeed(vcpu); 5018 } 5019 5020 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5021 { 5022 struct vcpu_vmx *vmx = to_vmx(vcpu); 5023 5024 if (vmx->nested.current_vmptr == INVALID_GPA) 5025 return; 5026 5027 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5028 5029 if (enable_shadow_vmcs) { 5030 /* copy to memory all shadowed fields in case 5031 they were modified */ 5032 copy_shadow_to_vmcs12(vmx); 5033 vmx_disable_shadow_vmcs(vmx); 5034 } 5035 vmx->nested.posted_intr_nv = -1; 5036 5037 /* Flush VMCS12 to guest memory */ 5038 kvm_vcpu_write_guest_page(vcpu, 5039 vmx->nested.current_vmptr >> PAGE_SHIFT, 5040 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5041 5042 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5043 5044 vmx->nested.current_vmptr = INVALID_GPA; 5045 } 5046 5047 /* Emulate the VMXOFF instruction */ 5048 static int handle_vmoff(struct kvm_vcpu *vcpu) 5049 { 5050 if (!nested_vmx_check_permission(vcpu)) 5051 return 1; 5052 5053 free_nested(vcpu); 5054 5055 /* Process a latched INIT during time CPU was in VMX operation */ 5056 kvm_make_request(KVM_REQ_EVENT, vcpu); 5057 5058 return nested_vmx_succeed(vcpu); 5059 } 5060 5061 /* Emulate the VMCLEAR instruction */ 5062 static int handle_vmclear(struct kvm_vcpu *vcpu) 5063 { 5064 struct vcpu_vmx *vmx = to_vmx(vcpu); 5065 u32 zero = 0; 5066 gpa_t vmptr; 5067 u64 evmcs_gpa; 5068 int r; 5069 5070 if (!nested_vmx_check_permission(vcpu)) 5071 return 1; 5072 5073 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5074 return r; 5075 5076 if (!page_address_valid(vcpu, vmptr)) 5077 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5078 5079 if (vmptr == vmx->nested.vmxon_ptr) 5080 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5081 5082 /* 5083 * When Enlightened VMEntry is enabled on the calling CPU we treat 5084 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 5085 * way to distinguish it from VMCS12) and we must not corrupt it by 5086 * writing to the non-existent 'launch_state' field. The area doesn't 5087 * have to be the currently active EVMCS on the calling CPU and there's 5088 * nothing KVM has to do to transition it from 'active' to 'non-active' 5089 * state. It is possible that the area will stay mapped as 5090 * vmx->nested.hv_evmcs but this shouldn't be a problem. 
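 *
 * If vmptr matches the currently active eVMCS, VMCLEAR is emulated by
 * simply releasing it (see the hv_evmcs_vmptr check below).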
5091 */ 5092 if (likely(!vmx->nested.enlightened_vmcs_enabled || 5093 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 5094 if (vmptr == vmx->nested.current_vmptr) 5095 nested_release_vmcs12(vcpu); 5096 5097 kvm_vcpu_write_guest(vcpu, 5098 vmptr + offsetof(struct vmcs12, 5099 launch_state), 5100 &zero, sizeof(zero)); 5101 } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { 5102 nested_release_evmcs(vcpu); 5103 } 5104 5105 return nested_vmx_succeed(vcpu); 5106 } 5107 5108 /* Emulate the VMLAUNCH instruction */ 5109 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5110 { 5111 return nested_vmx_run(vcpu, true); 5112 } 5113 5114 /* Emulate the VMRESUME instruction */ 5115 static int handle_vmresume(struct kvm_vcpu *vcpu) 5116 { 5117 5118 return nested_vmx_run(vcpu, false); 5119 } 5120 5121 static int handle_vmread(struct kvm_vcpu *vcpu) 5122 { 5123 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5124 : get_vmcs12(vcpu); 5125 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5126 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5127 struct vcpu_vmx *vmx = to_vmx(vcpu); 5128 struct x86_exception e; 5129 unsigned long field; 5130 u64 value; 5131 gva_t gva = 0; 5132 short offset; 5133 int len, r; 5134 5135 if (!nested_vmx_check_permission(vcpu)) 5136 return 1; 5137 5138 /* Decode instruction info and find the field to read */ 5139 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5140 5141 if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 5142 /* 5143 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5144 * any VMREAD sets the ALU flags for VMfailInvalid. 5145 */ 5146 if (vmx->nested.current_vmptr == INVALID_GPA || 5147 (is_guest_mode(vcpu) && 5148 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5149 return nested_vmx_failInvalid(vcpu); 5150 5151 offset = get_vmcs12_field_offset(field); 5152 if (offset < 0) 5153 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5154 5155 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5156 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5157 5158 /* Read the field, zero-extended to a u64 value */ 5159 value = vmcs12_read_any(vmcs12, field, offset); 5160 } else { 5161 /* 5162 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5163 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5164 * unsupported. Unfortunately, certain versions of Windows 11 5165 * don't comply with this requirement which is not enforced in 5166 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5167 * workaround, as misbehaving guests will panic on VM-Fail. 5168 * Note, enlightened VMCS is incompatible with shadow VMCS so 5169 * all VMREADs from L2 should go to L1. 5170 */ 5171 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5172 return nested_vmx_failInvalid(vcpu); 5173 5174 offset = evmcs_field_offset(field, NULL); 5175 if (offset < 0) 5176 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5177 5178 /* Read the field, zero-extended to a u64 value */ 5179 value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset); 5180 } 5181 5182 /* 5183 * Now copy part of this value to register or memory, as requested. 5184 * Note that the number of bits actually copied is 32 or 64 depending 5185 * on the guest's mode (32 or 64 bit), not on the given field's length. 5186 */ 5187 if (instr_info & BIT(10)) { 5188 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5189 } else { 5190 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5191 if (get_vmx_mem_address(vcpu, exit_qualification, 5192 instr_info, true, len, &gva)) 5193 return 1; 5194 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5195 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5196 if (r != X86EMUL_CONTINUE) 5197 return kvm_handle_memory_failure(vcpu, r, &e); 5198 } 5199 5200 return nested_vmx_succeed(vcpu); 5201 } 5202 5203 static bool is_shadow_field_rw(unsigned long field) 5204 { 5205 switch (field) { 5206 #define SHADOW_FIELD_RW(x, y) case x: 5207 #include "vmcs_shadow_fields.h" 5208 return true; 5209 default: 5210 break; 5211 } 5212 return false; 5213 } 5214 5215 static bool is_shadow_field_ro(unsigned long field) 5216 { 5217 switch (field) { 5218 #define SHADOW_FIELD_RO(x, y) case x: 5219 #include "vmcs_shadow_fields.h" 5220 return true; 5221 default: 5222 break; 5223 } 5224 return false; 5225 } 5226 5227 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5228 { 5229 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5230 : get_vmcs12(vcpu); 5231 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5232 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5233 struct vcpu_vmx *vmx = to_vmx(vcpu); 5234 struct x86_exception e; 5235 unsigned long field; 5236 short offset; 5237 gva_t gva; 5238 int len, r; 5239 5240 /* 5241 * The value to write might be 32 or 64 bits, depending on L1's long 5242 * mode, and eventually we need to write that into a field of several 5243 * possible lengths. The code below first zero-extends the value to 64 5244 * bit (value), and then copies only the appropriate number of 5245 * bits into the vmcs12 field. 5246 */ 5247 u64 value = 0; 5248 5249 if (!nested_vmx_check_permission(vcpu)) 5250 return 1; 5251 5252 /* 5253 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5254 * any VMWRITE sets the ALU flags for VMfailInvalid. 5255 */ 5256 if (vmx->nested.current_vmptr == INVALID_GPA || 5257 (is_guest_mode(vcpu) && 5258 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5259 return nested_vmx_failInvalid(vcpu); 5260 5261 if (instr_info & BIT(10)) 5262 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5263 else { 5264 len = is_64_bit_mode(vcpu) ? 8 : 4; 5265 if (get_vmx_mem_address(vcpu, exit_qualification, 5266 instr_info, false, len, &gva)) 5267 return 1; 5268 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5269 if (r != X86EMUL_CONTINUE) 5270 return kvm_handle_memory_failure(vcpu, r, &e); 5271 } 5272 5273 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5274 5275 offset = get_vmcs12_field_offset(field); 5276 if (offset < 0) 5277 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5278 5279 /* 5280 * If the vCPU supports "VMWRITE to any supported field in the 5281 * VMCS," then the "read-only" fields are actually read/write. 5282 */ 5283 if (vmcs_field_readonly(field) && 5284 !nested_cpu_has_vmwrite_any_field(vcpu)) 5285 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5286 5287 /* 5288 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5289 * vmcs12, else we may crush a field or consume a stale value. 5290 */ 5291 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5292 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5293 5294 /* 5295 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5296 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5297 * behavior regardless of the underlying hardware, e.g. 
if an AR_BYTE 5298 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5299 * from L1 will return a different value than VMREAD from L2 (L1 sees 5300 * the stripped down value, L2 sees the full value as stored by KVM). 5301 */ 5302 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5303 value &= 0x1f0ff; 5304 5305 vmcs12_write_any(vmcs12, field, offset, value); 5306 5307 /* 5308 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5309 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5310 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5311 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5312 */ 5313 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5314 /* 5315 * L1 can read these fields without exiting, ensure the 5316 * shadow VMCS is up-to-date. 5317 */ 5318 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5319 preempt_disable(); 5320 vmcs_load(vmx->vmcs01.shadow_vmcs); 5321 5322 __vmcs_writel(field, value); 5323 5324 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5325 vmcs_load(vmx->loaded_vmcs->vmcs); 5326 preempt_enable(); 5327 } 5328 vmx->nested.dirty_vmcs12 = true; 5329 } 5330 5331 return nested_vmx_succeed(vcpu); 5332 } 5333 5334 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5335 { 5336 vmx->nested.current_vmptr = vmptr; 5337 if (enable_shadow_vmcs) { 5338 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5339 vmcs_write64(VMCS_LINK_POINTER, 5340 __pa(vmx->vmcs01.shadow_vmcs)); 5341 vmx->nested.need_vmcs12_to_shadow_sync = true; 5342 } 5343 vmx->nested.dirty_vmcs12 = true; 5344 vmx->nested.force_msr_bitmap_recalc = true; 5345 } 5346 5347 /* Emulate the VMPTRLD instruction */ 5348 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5349 { 5350 struct vcpu_vmx *vmx = to_vmx(vcpu); 5351 gpa_t vmptr; 5352 int r; 5353 5354 if (!nested_vmx_check_permission(vcpu)) 5355 return 1; 5356 5357 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5358 return r; 5359 5360 if (!page_address_valid(vcpu, vmptr)) 5361 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5362 5363 if (vmptr == vmx->nested.vmxon_ptr) 5364 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5365 5366 /* Forbid normal VMPTRLD if Enlightened version was used */ 5367 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 5368 return 1; 5369 5370 if (vmx->nested.current_vmptr != vmptr) { 5371 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5372 struct vmcs_hdr hdr; 5373 5374 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5375 /* 5376 * Reads from an unbacked page return all 1s, 5377 * which means that the 32 bits located at the 5378 * given physical address won't match the required 5379 * VMCS12_REVISION identifier. 5380 */ 5381 return nested_vmx_fail(vcpu, 5382 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5383 } 5384 5385 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5386 offsetof(struct vmcs12, hdr), 5387 sizeof(hdr))) { 5388 return nested_vmx_fail(vcpu, 5389 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5390 } 5391 5392 if (hdr.revision_id != VMCS12_REVISION || 5393 (hdr.shadow_vmcs && 5394 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5395 return nested_vmx_fail(vcpu, 5396 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5397 } 5398 5399 nested_release_vmcs12(vcpu); 5400 5401 /* 5402 * Load VMCS12 from guest memory since it is not already 5403 * cached. 
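 * The cached copy is what KVM reads and writes from here on; it is flushed back to guest memory when the vmcs12 is released, see nested_release_vmcs12().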
5404 */ 5405 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5406 VMCS12_SIZE)) { 5407 return nested_vmx_fail(vcpu, 5408 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5409 } 5410 5411 set_current_vmptr(vmx, vmptr); 5412 } 5413 5414 return nested_vmx_succeed(vcpu); 5415 } 5416 5417 /* Emulate the VMPTRST instruction */ 5418 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5419 { 5420 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5421 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5422 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5423 struct x86_exception e; 5424 gva_t gva; 5425 int r; 5426 5427 if (!nested_vmx_check_permission(vcpu)) 5428 return 1; 5429 5430 if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) 5431 return 1; 5432 5433 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5434 true, sizeof(gpa_t), &gva)) 5435 return 1; 5436 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5437 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5438 sizeof(gpa_t), &e); 5439 if (r != X86EMUL_CONTINUE) 5440 return kvm_handle_memory_failure(vcpu, r, &e); 5441 5442 return nested_vmx_succeed(vcpu); 5443 } 5444 5445 /* Emulate the INVEPT instruction */ 5446 static int handle_invept(struct kvm_vcpu *vcpu) 5447 { 5448 struct vcpu_vmx *vmx = to_vmx(vcpu); 5449 u32 vmx_instruction_info, types; 5450 unsigned long type, roots_to_free; 5451 struct kvm_mmu *mmu; 5452 gva_t gva; 5453 struct x86_exception e; 5454 struct { 5455 u64 eptp, gpa; 5456 } operand; 5457 int i, r, gpr_index; 5458 5459 if (!(vmx->nested.msrs.secondary_ctls_high & 5460 SECONDARY_EXEC_ENABLE_EPT) || 5461 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5462 kvm_queue_exception(vcpu, UD_VECTOR); 5463 return 1; 5464 } 5465 5466 if (!nested_vmx_check_permission(vcpu)) 5467 return 1; 5468 5469 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5470 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5471 type = kvm_register_read(vcpu, gpr_index); 5472 5473 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5474 5475 if (type >= 32 || !(types & (1 << type))) 5476 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5477 5478 /* According to the Intel VMX instruction reference, the memory 5479 * operand is read even if it isn't needed (e.g., for type==global) 5480 */ 5481 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5482 vmx_instruction_info, false, sizeof(operand), &gva)) 5483 return 1; 5484 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5485 if (r != X86EMUL_CONTINUE) 5486 return kvm_handle_memory_failure(vcpu, r, &e); 5487 5488 /* 5489 * Nested EPT roots are always held through guest_mmu, 5490 * not root_mmu. 
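 * (root_mmu covers L1's own paging, or a non-nested guest's, so an INVEPT from L1 only needs to zap roots tracked by guest_mmu.)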
5491 */ 5492 mmu = &vcpu->arch.guest_mmu; 5493 5494 switch (type) { 5495 case VMX_EPT_EXTENT_CONTEXT: 5496 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5497 return nested_vmx_fail(vcpu, 5498 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5499 5500 roots_to_free = 0; 5501 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 5502 operand.eptp)) 5503 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5504 5505 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5506 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5507 mmu->prev_roots[i].pgd, 5508 operand.eptp)) 5509 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5510 } 5511 break; 5512 case VMX_EPT_EXTENT_GLOBAL: 5513 roots_to_free = KVM_MMU_ROOTS_ALL; 5514 break; 5515 default: 5516 BUG(); 5517 break; 5518 } 5519 5520 if (roots_to_free) 5521 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 5522 5523 return nested_vmx_succeed(vcpu); 5524 } 5525 5526 static int handle_invvpid(struct kvm_vcpu *vcpu) 5527 { 5528 struct vcpu_vmx *vmx = to_vmx(vcpu); 5529 u32 vmx_instruction_info; 5530 unsigned long type, types; 5531 gva_t gva; 5532 struct x86_exception e; 5533 struct { 5534 u64 vpid; 5535 u64 gla; 5536 } operand; 5537 u16 vpid02; 5538 int r, gpr_index; 5539 5540 if (!(vmx->nested.msrs.secondary_ctls_high & 5541 SECONDARY_EXEC_ENABLE_VPID) || 5542 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5543 kvm_queue_exception(vcpu, UD_VECTOR); 5544 return 1; 5545 } 5546 5547 if (!nested_vmx_check_permission(vcpu)) 5548 return 1; 5549 5550 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5551 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5552 type = kvm_register_read(vcpu, gpr_index); 5553 5554 types = (vmx->nested.msrs.vpid_caps & 5555 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5556 5557 if (type >= 32 || !(types & (1 << type))) 5558 return nested_vmx_fail(vcpu, 5559 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5560 5561 /* according to the intel vmx instruction reference, the memory 5562 * operand is read even if it isn't needed (e.g., for type==global) 5563 */ 5564 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5565 vmx_instruction_info, false, sizeof(operand), &gva)) 5566 return 1; 5567 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5568 if (r != X86EMUL_CONTINUE) 5569 return kvm_handle_memory_failure(vcpu, r, &e); 5570 5571 if (operand.vpid >> 16) 5572 return nested_vmx_fail(vcpu, 5573 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5574 5575 vpid02 = nested_get_vpid02(vcpu); 5576 switch (type) { 5577 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5578 if (!operand.vpid || 5579 is_noncanonical_address(operand.gla, vcpu)) 5580 return nested_vmx_fail(vcpu, 5581 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5582 vpid_sync_vcpu_addr(vpid02, operand.gla); 5583 break; 5584 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5585 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5586 if (!operand.vpid) 5587 return nested_vmx_fail(vcpu, 5588 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5589 vpid_sync_context(vpid02); 5590 break; 5591 case VMX_VPID_EXTENT_ALL_CONTEXT: 5592 vpid_sync_context(vpid02); 5593 break; 5594 default: 5595 WARN_ON_ONCE(1); 5596 return kvm_skip_emulated_instruction(vcpu); 5597 } 5598 5599 /* 5600 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5601 * linear mappings for L2 (tagged with L2's VPID). Free all guest 5602 * roots as VPIDs are not tracked in the MMU role. 5603 * 5604 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5605 * an MMU when EPT is disabled. 
5606 * 5607 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR. 5608 */ 5609 if (!enable_ept) 5610 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 5611 5612 return nested_vmx_succeed(vcpu); 5613 } 5614 5615 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5616 struct vmcs12 *vmcs12) 5617 { 5618 u32 index = kvm_rcx_read(vcpu); 5619 u64 new_eptp; 5620 5621 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 5622 return 1; 5623 if (index >= VMFUNC_EPTP_ENTRIES) 5624 return 1; 5625 5626 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5627 &new_eptp, index * 8, 8)) 5628 return 1; 5629 5630 /* 5631 * If the (L2) guest does a vmfunc to the currently 5632 * active ept pointer, we don't have to do anything else 5633 */ 5634 if (vmcs12->ept_pointer != new_eptp) { 5635 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5636 return 1; 5637 5638 vmcs12->ept_pointer = new_eptp; 5639 nested_ept_new_eptp(vcpu); 5640 5641 if (!nested_cpu_has_vpid(vmcs12)) 5642 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 5643 } 5644 5645 return 0; 5646 } 5647 5648 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5649 { 5650 struct vcpu_vmx *vmx = to_vmx(vcpu); 5651 struct vmcs12 *vmcs12; 5652 u32 function = kvm_rax_read(vcpu); 5653 5654 /* 5655 * VMFUNC is only supported for nested guests, but we always enable the 5656 * secondary control for simplicity; for non-nested mode, fake that we 5657 * didn't by injecting #UD. 5658 */ 5659 if (!is_guest_mode(vcpu)) { 5660 kvm_queue_exception(vcpu, UD_VECTOR); 5661 return 1; 5662 } 5663 5664 vmcs12 = get_vmcs12(vcpu); 5665 5666 /* 5667 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 5668 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 5669 */ 5670 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 5671 kvm_queue_exception(vcpu, UD_VECTOR); 5672 return 1; 5673 } 5674 5675 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 5676 goto fail; 5677 5678 switch (function) { 5679 case 0: 5680 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5681 goto fail; 5682 break; 5683 default: 5684 goto fail; 5685 } 5686 return kvm_skip_emulated_instruction(vcpu); 5687 5688 fail: 5689 /* 5690 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5691 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5692 * EXIT_REASON_VMFUNC as the exit reason. 5693 */ 5694 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5695 vmx_get_intr_info(vcpu), 5696 vmx_get_exit_qual(vcpu)); 5697 return 1; 5698 } 5699 5700 /* 5701 * Return true if an IO instruction with the specified port and size should cause 5702 * a VM-exit into L1.
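 * For example, a two-byte access at port 0x7fff consults the last bit of io_bitmap_a (port 0x7fff) and the first bit of io_bitmap_b (port 0x8000); if either bit is set, the access must exit to L1.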
5703 */ 5704 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5705 int size) 5706 { 5707 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5708 gpa_t bitmap, last_bitmap; 5709 u8 b; 5710 5711 last_bitmap = INVALID_GPA; 5712 b = -1; 5713 5714 while (size > 0) { 5715 if (port < 0x8000) 5716 bitmap = vmcs12->io_bitmap_a; 5717 else if (port < 0x10000) 5718 bitmap = vmcs12->io_bitmap_b; 5719 else 5720 return true; 5721 bitmap += (port & 0x7fff) / 8; 5722 5723 if (last_bitmap != bitmap) 5724 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5725 return true; 5726 if (b & (1 << (port & 7))) 5727 return true; 5728 5729 port++; 5730 size--; 5731 last_bitmap = bitmap; 5732 } 5733 5734 return false; 5735 } 5736 5737 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5738 struct vmcs12 *vmcs12) 5739 { 5740 unsigned long exit_qualification; 5741 unsigned short port; 5742 int size; 5743 5744 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5745 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 5746 5747 exit_qualification = vmx_get_exit_qual(vcpu); 5748 5749 port = exit_qualification >> 16; 5750 size = (exit_qualification & 7) + 1; 5751 5752 return nested_vmx_check_io_bitmaps(vcpu, port, size); 5753 } 5754 5755 /* 5756 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 5757 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 5758 * disinterest in the current event (read or write a specific MSR) by using an 5759 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 5760 */ 5761 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 5762 struct vmcs12 *vmcs12, 5763 union vmx_exit_reason exit_reason) 5764 { 5765 u32 msr_index = kvm_rcx_read(vcpu); 5766 gpa_t bitmap; 5767 5768 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 5769 return true; 5770 5771 /* 5772 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 5773 * for the four combinations of read/write and low/high MSR numbers. 5774 * First we need to figure out which of the four to use: 5775 */ 5776 bitmap = vmcs12->msr_bitmap; 5777 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 5778 bitmap += 2048; 5779 if (msr_index >= 0xc0000000) { 5780 msr_index -= 0xc0000000; 5781 bitmap += 1024; 5782 } 5783 5784 /* Then read the msr_index'th bit from this bitmap: */ 5785 if (msr_index < 1024*8) { 5786 unsigned char b; 5787 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 5788 return true; 5789 return 1 & (b >> (msr_index & 7)); 5790 } else 5791 return true; /* let L1 handle the wrong parameter */ 5792 } 5793 5794 /* 5795 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 5796 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 5797 * intercept (via guest_host_mask etc.) the current event. 
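 * For example, if L1 owns CR0.TS (X86_CR0_TS set in cr0_guest_host_mask), a MOV to CR0 by L2 that changes TS relative to cr0_read_shadow must be reflected to L1; otherwise the exit stays in L0.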
5798 */ 5799 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 5800 struct vmcs12 *vmcs12) 5801 { 5802 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5803 int cr = exit_qualification & 15; 5804 int reg; 5805 unsigned long val; 5806 5807 switch ((exit_qualification >> 4) & 3) { 5808 case 0: /* mov to cr */ 5809 reg = (exit_qualification >> 8) & 15; 5810 val = kvm_register_read(vcpu, reg); 5811 switch (cr) { 5812 case 0: 5813 if (vmcs12->cr0_guest_host_mask & 5814 (val ^ vmcs12->cr0_read_shadow)) 5815 return true; 5816 break; 5817 case 3: 5818 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 5819 return true; 5820 break; 5821 case 4: 5822 if (vmcs12->cr4_guest_host_mask & 5823 (vmcs12->cr4_read_shadow ^ val)) 5824 return true; 5825 break; 5826 case 8: 5827 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 5828 return true; 5829 break; 5830 } 5831 break; 5832 case 2: /* clts */ 5833 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 5834 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 5835 return true; 5836 break; 5837 case 1: /* mov from cr */ 5838 switch (cr) { 5839 case 3: 5840 if (vmcs12->cpu_based_vm_exec_control & 5841 CPU_BASED_CR3_STORE_EXITING) 5842 return true; 5843 break; 5844 case 8: 5845 if (vmcs12->cpu_based_vm_exec_control & 5846 CPU_BASED_CR8_STORE_EXITING) 5847 return true; 5848 break; 5849 } 5850 break; 5851 case 3: /* lmsw */ 5852 /* 5853 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 5854 * cr0. Other attempted changes are ignored, with no exit. 5855 */ 5856 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5857 if (vmcs12->cr0_guest_host_mask & 0xe & 5858 (val ^ vmcs12->cr0_read_shadow)) 5859 return true; 5860 if ((vmcs12->cr0_guest_host_mask & 0x1) && 5861 !(vmcs12->cr0_read_shadow & 0x1) && 5862 (val & 0x1)) 5863 return true; 5864 break; 5865 } 5866 return false; 5867 } 5868 5869 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 5870 struct vmcs12 *vmcs12) 5871 { 5872 u32 encls_leaf; 5873 5874 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 5875 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 5876 return false; 5877 5878 encls_leaf = kvm_rax_read(vcpu); 5879 if (encls_leaf > 62) 5880 encls_leaf = 63; 5881 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 5882 } 5883 5884 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 5885 struct vmcs12 *vmcs12, gpa_t bitmap) 5886 { 5887 u32 vmx_instruction_info; 5888 unsigned long field; 5889 u8 b; 5890 5891 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 5892 return true; 5893 5894 /* Decode instruction info and find the field to access */ 5895 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5896 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 5897 5898 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 5899 if (field >> 15) 5900 return true; 5901 5902 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 5903 return true; 5904 5905 return 1 & (b >> (field & 7)); 5906 } 5907 5908 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 5909 { 5910 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 5911 5912 if (nested_cpu_has_mtf(vmcs12)) 5913 return true; 5914 5915 /* 5916 * An MTF VM-exit may be injected into the guest by setting the 5917 * interruption-type to 7 (other event) and the vector field to 0. Such 5918 * is the case regardless of the 'monitor trap flag' VM-execution 5919 * control. 
5920 */ 5921 return entry_intr_info == (INTR_INFO_VALID_MASK 5922 | INTR_TYPE_OTHER_EVENT); 5923 } 5924 5925 /* 5926 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 5927 * L1 wants the exit. Only call this when in is_guest_mode (L2). 5928 */ 5929 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 5930 union vmx_exit_reason exit_reason) 5931 { 5932 u32 intr_info; 5933 5934 switch ((u16)exit_reason.basic) { 5935 case EXIT_REASON_EXCEPTION_NMI: 5936 intr_info = vmx_get_intr_info(vcpu); 5937 if (is_nmi(intr_info)) 5938 return true; 5939 else if (is_page_fault(intr_info)) 5940 return vcpu->arch.apf.host_apf_flags || 5941 vmx_need_pf_intercept(vcpu); 5942 else if (is_debug(intr_info) && 5943 vcpu->guest_debug & 5944 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5945 return true; 5946 else if (is_breakpoint(intr_info) && 5947 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5948 return true; 5949 else if (is_alignment_check(intr_info) && 5950 !vmx_guest_inject_ac(vcpu)) 5951 return true; 5952 return false; 5953 case EXIT_REASON_EXTERNAL_INTERRUPT: 5954 return true; 5955 case EXIT_REASON_MCE_DURING_VMENTRY: 5956 return true; 5957 case EXIT_REASON_EPT_VIOLATION: 5958 /* 5959 * L0 always deals with the EPT violation. If nested EPT is 5960 * used, and the nested mmu code discovers that the address is 5961 * missing in the guest EPT table (EPT12), the EPT violation 5962 * will be injected with nested_ept_inject_page_fault() 5963 */ 5964 return true; 5965 case EXIT_REASON_EPT_MISCONFIG: 5966 /* 5967 * L2 never uses directly L1's EPT, but rather L0's own EPT 5968 * table (shadow on EPT) or a merged EPT table that L0 built 5969 * (EPT on EPT). So any problems with the structure of the 5970 * table is L0's fault. 5971 */ 5972 return true; 5973 case EXIT_REASON_PREEMPTION_TIMER: 5974 return true; 5975 case EXIT_REASON_PML_FULL: 5976 /* 5977 * PML is emulated for an L1 VMM and should never be enabled in 5978 * vmcs02, always "handle" PML_FULL by exiting to userspace. 5979 */ 5980 return true; 5981 case EXIT_REASON_VMFUNC: 5982 /* VM functions are emulated through L2->L0 vmexits. */ 5983 return true; 5984 case EXIT_REASON_BUS_LOCK: 5985 /* 5986 * At present, bus lock VM exit is never exposed to L1. 5987 * Handle L2's bus locks in L0 directly. 5988 */ 5989 return true; 5990 default: 5991 break; 5992 } 5993 return false; 5994 } 5995 5996 /* 5997 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 5998 * is_guest_mode (L2). 
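 * nested_vmx_reflect_vmexit() gives L0 first claim on the exit via nested_vmx_l0_wants_exit() before consulting this function.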
5999 */ 6000 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6001 union vmx_exit_reason exit_reason) 6002 { 6003 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6004 u32 intr_info; 6005 6006 switch ((u16)exit_reason.basic) { 6007 case EXIT_REASON_EXCEPTION_NMI: 6008 intr_info = vmx_get_intr_info(vcpu); 6009 if (is_nmi(intr_info)) 6010 return true; 6011 else if (is_page_fault(intr_info)) 6012 return true; 6013 return vmcs12->exception_bitmap & 6014 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6015 case EXIT_REASON_EXTERNAL_INTERRUPT: 6016 return nested_exit_on_intr(vcpu); 6017 case EXIT_REASON_TRIPLE_FAULT: 6018 return true; 6019 case EXIT_REASON_INTERRUPT_WINDOW: 6020 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6021 case EXIT_REASON_NMI_WINDOW: 6022 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6023 case EXIT_REASON_TASK_SWITCH: 6024 return true; 6025 case EXIT_REASON_CPUID: 6026 return true; 6027 case EXIT_REASON_HLT: 6028 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6029 case EXIT_REASON_INVD: 6030 return true; 6031 case EXIT_REASON_INVLPG: 6032 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6033 case EXIT_REASON_RDPMC: 6034 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6035 case EXIT_REASON_RDRAND: 6036 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6037 case EXIT_REASON_RDSEED: 6038 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6039 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6040 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6041 case EXIT_REASON_VMREAD: 6042 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6043 vmcs12->vmread_bitmap); 6044 case EXIT_REASON_VMWRITE: 6045 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6046 vmcs12->vmwrite_bitmap); 6047 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6048 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6049 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6050 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6051 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6052 /* 6053 * VMX instructions trap unconditionally. This allows L1 to 6054 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
6055 */ 6056 return true; 6057 case EXIT_REASON_CR_ACCESS: 6058 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6059 case EXIT_REASON_DR_ACCESS: 6060 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6061 case EXIT_REASON_IO_INSTRUCTION: 6062 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6063 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6064 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6065 case EXIT_REASON_MSR_READ: 6066 case EXIT_REASON_MSR_WRITE: 6067 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6068 case EXIT_REASON_INVALID_STATE: 6069 return true; 6070 case EXIT_REASON_MWAIT_INSTRUCTION: 6071 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6072 case EXIT_REASON_MONITOR_TRAP_FLAG: 6073 return nested_vmx_exit_handled_mtf(vmcs12); 6074 case EXIT_REASON_MONITOR_INSTRUCTION: 6075 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6076 case EXIT_REASON_PAUSE_INSTRUCTION: 6077 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6078 nested_cpu_has2(vmcs12, 6079 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6080 case EXIT_REASON_MCE_DURING_VMENTRY: 6081 return true; 6082 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6083 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6084 case EXIT_REASON_APIC_ACCESS: 6085 case EXIT_REASON_APIC_WRITE: 6086 case EXIT_REASON_EOI_INDUCED: 6087 /* 6088 * The controls for "virtualize APIC accesses," "APIC- 6089 * register virtualization," and "virtual-interrupt 6090 * delivery" only come from vmcs12. 6091 */ 6092 return true; 6093 case EXIT_REASON_INVPCID: 6094 return 6095 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6096 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6097 case EXIT_REASON_WBINVD: 6098 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6099 case EXIT_REASON_XSETBV: 6100 return true; 6101 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6102 /* 6103 * This should never happen, since it is not possible to 6104 * set XSS to a non-zero value---neither in L1 nor in L2. 6105 * If it were, XSS would have to be checked against 6106 * the XSS exit bitmap in vmcs12. 6107 */ 6108 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 6109 case EXIT_REASON_UMWAIT: 6110 case EXIT_REASON_TPAUSE: 6111 return nested_cpu_has2(vmcs12, 6112 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6113 case EXIT_REASON_ENCLS: 6114 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6115 default: 6116 return true; 6117 } 6118 } 6119 6120 /* 6121 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6122 * reflected into L1. 6123 */ 6124 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6125 { 6126 struct vcpu_vmx *vmx = to_vmx(vcpu); 6127 union vmx_exit_reason exit_reason = vmx->exit_reason; 6128 unsigned long exit_qual; 6129 u32 exit_intr_info; 6130 6131 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6132 6133 /* 6134 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6135 * has already loaded L2's state. 6136 */ 6137 if (unlikely(vmx->fail)) { 6138 trace_kvm_nested_vmenter_failed( 6139 "hardware VM-instruction error: ", 6140 vmcs_read32(VM_INSTRUCTION_ERROR)); 6141 exit_intr_info = 0; 6142 exit_qual = 0; 6143 goto reflect_vmexit; 6144 } 6145 6146 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6147 6148 /* If L0 (KVM) wants the exit, it trumps L1's desires. */ 6149 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6150 return false; 6151 6152 /* If L1 doesn't want the exit, handle it in L0.
*/ 6153 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6154 return false; 6155 6156 /* 6157 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6158 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6159 * need to be synthesized by querying the in-kernel LAPIC, but external 6160 * interrupts are never reflected to L1 so it's a non-issue. 6161 */ 6162 exit_intr_info = vmx_get_intr_info(vcpu); 6163 if (is_exception_with_error_code(exit_intr_info)) { 6164 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6165 6166 vmcs12->vm_exit_intr_error_code = 6167 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6168 } 6169 exit_qual = vmx_get_exit_qual(vcpu); 6170 6171 reflect_vmexit: 6172 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6173 return true; 6174 } 6175 6176 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6177 struct kvm_nested_state __user *user_kvm_nested_state, 6178 u32 user_data_size) 6179 { 6180 struct vcpu_vmx *vmx; 6181 struct vmcs12 *vmcs12; 6182 struct kvm_nested_state kvm_state = { 6183 .flags = 0, 6184 .format = KVM_STATE_NESTED_FORMAT_VMX, 6185 .size = sizeof(kvm_state), 6186 .hdr.vmx.flags = 0, 6187 .hdr.vmx.vmxon_pa = INVALID_GPA, 6188 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6189 .hdr.vmx.preemption_timer_deadline = 0, 6190 }; 6191 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6192 &user_kvm_nested_state->data.vmx[0]; 6193 6194 if (!vcpu) 6195 return kvm_state.size + sizeof(*user_vmx_nested_state); 6196 6197 vmx = to_vmx(vcpu); 6198 vmcs12 = get_vmcs12(vcpu); 6199 6200 if (nested_vmx_allowed(vcpu) && 6201 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6202 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6203 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6204 6205 if (vmx_has_valid_vmcs12(vcpu)) { 6206 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6207 6208 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6209 if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) 6210 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6211 6212 if (is_guest_mode(vcpu) && 6213 nested_cpu_has_shadow_vmcs(vmcs12) && 6214 vmcs12->vmcs_link_pointer != INVALID_GPA) 6215 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6216 } 6217 6218 if (vmx->nested.smm.vmxon) 6219 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6220 6221 if (vmx->nested.smm.guest_mode) 6222 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6223 6224 if (is_guest_mode(vcpu)) { 6225 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6226 6227 if (vmx->nested.nested_run_pending) 6228 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6229 6230 if (vmx->nested.mtf_pending) 6231 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6232 6233 if (nested_cpu_has_preemption_timer(vmcs12) && 6234 vmx->nested.has_preemption_timer_deadline) { 6235 kvm_state.hdr.vmx.flags |= 6236 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6237 kvm_state.hdr.vmx.preemption_timer_deadline = 6238 vmx->nested.preemption_timer_deadline; 6239 } 6240 } 6241 } 6242 6243 if (user_data_size < kvm_state.size) 6244 goto out; 6245 6246 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6247 return -EFAULT; 6248 6249 if (!vmx_has_valid_vmcs12(vcpu)) 6250 goto out; 6251 6252 /* 6253 * When running L2, the authoritative vmcs12 state is in the 6254 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6255 * in the shadow or enlightened vmcs linked to vmcs01, unless 6256 * need_vmcs12_to_shadow_sync is set, in which case the authoritative 6257 * vmcs12 state is in the vmcs12 already. 6258 */ 6259 if (is_guest_mode(vcpu)) { 6260 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6261 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6262 } else { 6263 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6264 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6265 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 6266 /* 6267 * The L1 hypervisor is not obliged to keep the eVMCS 6268 * clean-fields data up-to-date while not in guest 6269 * mode; 'hv_clean_fields' is only guaranteed to be 6270 * current at VM-entry, so ignore it here and do a 6271 * full copy. 6272 */ 6273 copy_enlightened_to_vmcs12(vmx, 0); 6274 else if (enable_shadow_vmcs) 6275 copy_shadow_to_vmcs12(vmx); 6276 } 6277 } 6278 6279 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6280 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6281 6282 /* 6283 * Copy over the full allocated size of vmcs12 rather than just the size 6284 * of the struct. 6285 */ 6286 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6287 return -EFAULT; 6288 6289 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6290 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6291 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6292 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6293 return -EFAULT; 6294 } 6295 out: 6296 return kvm_state.size; 6297 } 6298 6299 /* 6300 * Forcibly leave nested mode in order to be able to reset the VCPU later on. 6301 */ 6302 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6303 { 6304 if (is_guest_mode(vcpu)) { 6305 to_vmx(vcpu)->nested.nested_run_pending = 0; 6306 nested_vmx_vmexit(vcpu, -1, 0, 0); 6307 } 6308 free_nested(vcpu); 6309 } 6310 6311 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6312 struct kvm_nested_state __user *user_kvm_nested_state, 6313 struct kvm_nested_state *kvm_state) 6314 { 6315 struct vcpu_vmx *vmx = to_vmx(vcpu); 6316 struct vmcs12 *vmcs12; 6317 enum vm_entry_failure_code ignored; 6318 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6319 &user_kvm_nested_state->data.vmx[0]; 6320 int ret; 6321 6322 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6323 return -EINVAL; 6324 6325 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6326 if (kvm_state->hdr.vmx.smm.flags) 6327 return -EINVAL; 6328 6329 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6330 return -EINVAL; 6331 6332 /* 6333 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6334 * enable the eVMCS capability on the vCPU. However, the code has 6335 * since been changed such that the flag signals that vmcs12 should 6336 * be copied into the eVMCS in guest memory. 6337 * 6338 * To preserve backwards compatibility, allow userspace 6339 * to set this flag even when there is no VMXON region.
6340 */ 6341 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6342 return -EINVAL; 6343 } else { 6344 if (!nested_vmx_allowed(vcpu)) 6345 return -EINVAL; 6346 6347 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6348 return -EINVAL; 6349 } 6350 6351 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6352 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6353 return -EINVAL; 6354 6355 if (kvm_state->hdr.vmx.smm.flags & 6356 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6357 return -EINVAL; 6358 6359 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6360 return -EINVAL; 6361 6362 /* 6363 * SMM temporarily disables VMX, so we cannot be in guest mode, 6364 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6365 * must be zero. 6366 */ 6367 if (is_smm(vcpu) ? 6368 (kvm_state->flags & 6369 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6370 : kvm_state->hdr.vmx.smm.flags) 6371 return -EINVAL; 6372 6373 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6374 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6375 return -EINVAL; 6376 6377 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6378 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) 6379 return -EINVAL; 6380 6381 vmx_leave_nested(vcpu); 6382 6383 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6384 return 0; 6385 6386 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6387 ret = enter_vmx_operation(vcpu); 6388 if (ret) 6389 return ret; 6390 6391 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6392 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6393 /* See vmx_has_valid_vmcs12. */ 6394 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6395 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6396 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6397 return -EINVAL; 6398 else 6399 return 0; 6400 } 6401 6402 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6403 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6404 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6405 return -EINVAL; 6406 6407 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6408 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6409 /* 6410 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6411 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6412 * restored yet. EVMCS will be mapped from 6413 * nested_get_vmcs12_pages(). 
6414 */ 6415 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; 6416 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 6417 } else { 6418 return -EINVAL; 6419 } 6420 6421 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 6422 vmx->nested.smm.vmxon = true; 6423 vmx->nested.vmxon = false; 6424 6425 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 6426 vmx->nested.smm.guest_mode = true; 6427 } 6428 6429 vmcs12 = get_vmcs12(vcpu); 6430 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 6431 return -EFAULT; 6432 6433 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 6434 return -EINVAL; 6435 6436 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6437 return 0; 6438 6439 vmx->nested.nested_run_pending = 6440 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 6441 6442 vmx->nested.mtf_pending = 6443 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 6444 6445 ret = -EINVAL; 6446 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6447 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6448 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6449 6450 if (kvm_state->size < 6451 sizeof(*kvm_state) + 6452 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 6453 goto error_guest_mode; 6454 6455 if (copy_from_user(shadow_vmcs12, 6456 user_vmx_nested_state->shadow_vmcs12, 6457 sizeof(*shadow_vmcs12))) { 6458 ret = -EFAULT; 6459 goto error_guest_mode; 6460 } 6461 6462 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6463 !shadow_vmcs12->hdr.shadow_vmcs) 6464 goto error_guest_mode; 6465 } 6466 6467 vmx->nested.has_preemption_timer_deadline = false; 6468 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 6469 vmx->nested.has_preemption_timer_deadline = true; 6470 vmx->nested.preemption_timer_deadline = 6471 kvm_state->hdr.vmx.preemption_timer_deadline; 6472 } 6473 6474 if (nested_vmx_check_controls(vcpu, vmcs12) || 6475 nested_vmx_check_host_state(vcpu, vmcs12) || 6476 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6477 goto error_guest_mode; 6478 6479 vmx->nested.dirty_vmcs12 = true; 6480 vmx->nested.force_msr_bitmap_recalc = true; 6481 ret = nested_vmx_enter_non_root_mode(vcpu, false); 6482 if (ret) 6483 goto error_guest_mode; 6484 6485 return 0; 6486 6487 error_guest_mode: 6488 vmx->nested.nested_run_pending = 0; 6489 return ret; 6490 } 6491 6492 void nested_vmx_set_vmcs_shadowing_bitmap(void) 6493 { 6494 if (enable_shadow_vmcs) { 6495 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 6496 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 6497 } 6498 } 6499 6500 /* 6501 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo 6502 * that madness to get the encoding for comparison. 6503 */ 6504 #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) 6505 6506 static u64 nested_vmx_calc_vmcs_enum_msr(void) 6507 { 6508 /* 6509 * Note these are the so called "index" of the VMCS field encoding, not 6510 * the index into vmcs12. 6511 */ 6512 unsigned int max_idx, idx; 6513 int i; 6514 6515 /* 6516 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in 6517 * vmcs12, regardless of whether or not the associated feature is 6518 * exposed to L1. Simply find the field with the highest index. 6519 */ 6520 max_idx = 0; 6521 for (i = 0; i < nr_vmcs12_fields; i++) { 6522 /* The vmcs12 table is very, very sparsely populated. 
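 * For example, an encoding such as 0x4000 is stored at table index 0x10 (0x4000 rotated left by 6 as a u16), and VMCS12_IDX_TO_ENC(0x10) recovers 0x4000 for vmcs_field_index().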
*/ 6523 if (!vmcs12_field_offsets[i]) 6524 continue; 6525 6526 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); 6527 if (idx > max_idx) 6528 max_idx = idx; 6529 } 6530 6531 return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; 6532 } 6533 6534 /* 6535 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 6536 * returned for the various VMX controls MSRs when nested VMX is enabled. 6537 * The same values should also be used to verify that vmcs12 control fields are 6538 * valid during nested entry from L1 to L2. 6539 * Each of these control msrs has a low and high 32-bit half: A low bit is on 6540 * if the corresponding bit in the (32-bit) control field *must* be on, and a 6541 * bit in the high half is on if the corresponding bit in the control field 6542 * may be on. See also vmx_control_verify(). 6543 */ 6544 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) 6545 { 6546 /* 6547 * Note that as a general rule, the high half of the MSRs (bits in 6548 * the control fields which may be 1) should be initialized by the 6549 * intersection of the underlying hardware's MSR (i.e., features which 6550 * can be supported) and the list of features we want to expose - 6551 * because they are known to be properly supported in our code. 6552 * Also, usually, the low half of the MSRs (bits which must be 1) can 6553 * be set to 0, meaning that L1 may turn off any of these bits. The 6554 * reason is that if one of these bits is necessary, it will appear 6555 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 6556 * fields of vmcs01 and vmcs02, will turn these bits off - and 6557 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 6558 * These rules have exceptions below. 6559 */ 6560 6561 /* pin-based controls */ 6562 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 6563 msrs->pinbased_ctls_low, 6564 msrs->pinbased_ctls_high); 6565 msrs->pinbased_ctls_low |= 6566 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6567 msrs->pinbased_ctls_high &= 6568 PIN_BASED_EXT_INTR_MASK | 6569 PIN_BASED_NMI_EXITING | 6570 PIN_BASED_VIRTUAL_NMIS | 6571 (enable_apicv ? PIN_BASED_POSTED_INTR : 0); 6572 msrs->pinbased_ctls_high |= 6573 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6574 PIN_BASED_VMX_PREEMPTION_TIMER; 6575 6576 /* exit controls */ 6577 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 6578 msrs->exit_ctls_low, 6579 msrs->exit_ctls_high); 6580 msrs->exit_ctls_low = 6581 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 6582 6583 msrs->exit_ctls_high &= 6584 #ifdef CONFIG_X86_64 6585 VM_EXIT_HOST_ADDR_SPACE_SIZE | 6586 #endif 6587 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 6588 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 6589 msrs->exit_ctls_high |= 6590 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 6591 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 6592 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 6593 6594 /* We support free control of debug control saving. */ 6595 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 6596 6597 /* entry controls */ 6598 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 6599 msrs->entry_ctls_low, 6600 msrs->entry_ctls_high); 6601 msrs->entry_ctls_low = 6602 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 6603 msrs->entry_ctls_high &= 6604 #ifdef CONFIG_X86_64 6605 VM_ENTRY_IA32E_MODE | 6606 #endif 6607 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | 6608 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 6609 msrs->entry_ctls_high |= 6610 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 6611 6612 /* We support free control of debug control loading. 
*/ 6613 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 6614 6615 /* cpu-based controls */ 6616 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 6617 msrs->procbased_ctls_low, 6618 msrs->procbased_ctls_high); 6619 msrs->procbased_ctls_low = 6620 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6621 msrs->procbased_ctls_high &= 6622 CPU_BASED_INTR_WINDOW_EXITING | 6623 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 6624 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 6625 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 6626 CPU_BASED_CR3_STORE_EXITING | 6627 #ifdef CONFIG_X86_64 6628 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 6629 #endif 6630 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 6631 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 6632 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 6633 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 6634 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 6635 /* 6636 * We can allow some features even when not supported by the 6637 * hardware. For example, L1 can specify an MSR bitmap - and we 6638 * can use it to avoid exits to L1 - even when L0 runs L2 6639 * without MSR bitmaps. 6640 */ 6641 msrs->procbased_ctls_high |= 6642 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6643 CPU_BASED_USE_MSR_BITMAPS; 6644 6645 /* We support free control of CR3 access interception. */ 6646 msrs->procbased_ctls_low &= 6647 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 6648 6649 /* 6650 * secondary cpu-based controls. Do not include those that 6651 * depend on CPUID bits, they are added later by 6652 * vmx_vcpu_after_set_cpuid. 6653 */ 6654 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) 6655 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 6656 msrs->secondary_ctls_low, 6657 msrs->secondary_ctls_high); 6658 6659 msrs->secondary_ctls_low = 0; 6660 msrs->secondary_ctls_high &= 6661 SECONDARY_EXEC_DESC | 6662 SECONDARY_EXEC_ENABLE_RDTSCP | 6663 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 6664 SECONDARY_EXEC_WBINVD_EXITING | 6665 SECONDARY_EXEC_APIC_REGISTER_VIRT | 6666 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 6667 SECONDARY_EXEC_RDRAND_EXITING | 6668 SECONDARY_EXEC_ENABLE_INVPCID | 6669 SECONDARY_EXEC_RDSEED_EXITING | 6670 SECONDARY_EXEC_XSAVES | 6671 SECONDARY_EXEC_TSC_SCALING; 6672 6673 /* 6674 * We can emulate "VMCS shadowing," even if the hardware 6675 * doesn't support it. 
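 * If L1 enables it for L2, VMREAD and VMWRITE from L2 still exit to L0, which consults vmcs12's vmread/vmwrite bitmaps and either emulates the access against the shadow vmcs12 or reflects the exit to L1 (see nested_vmx_exit_handled_vmcs_access()).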
6676 */ 6677 msrs->secondary_ctls_high |= 6678 SECONDARY_EXEC_SHADOW_VMCS; 6679 6680 if (enable_ept) { 6681 /* nested EPT: emulate EPT also to L1 */ 6682 msrs->secondary_ctls_high |= 6683 SECONDARY_EXEC_ENABLE_EPT; 6684 msrs->ept_caps = 6685 VMX_EPT_PAGE_WALK_4_BIT | 6686 VMX_EPT_PAGE_WALK_5_BIT | 6687 VMX_EPTP_WB_BIT | 6688 VMX_EPT_INVEPT_BIT | 6689 VMX_EPT_EXECUTE_ONLY_BIT; 6690 6691 msrs->ept_caps &= ept_caps; 6692 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 6693 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 6694 VMX_EPT_1GB_PAGE_BIT; 6695 if (enable_ept_ad_bits) { 6696 msrs->secondary_ctls_high |= 6697 SECONDARY_EXEC_ENABLE_PML; 6698 msrs->ept_caps |= VMX_EPT_AD_BIT; 6699 } 6700 } 6701 6702 if (cpu_has_vmx_vmfunc()) { 6703 msrs->secondary_ctls_high |= 6704 SECONDARY_EXEC_ENABLE_VMFUNC; 6705 /* 6706 * Advertise EPTP switching unconditionally 6707 * since we emulate it 6708 */ 6709 if (enable_ept) 6710 msrs->vmfunc_controls = 6711 VMX_VMFUNC_EPTP_SWITCHING; 6712 } 6713 6714 /* 6715 * Old versions of KVM use the single-context version without 6716 * checking for support, so declare that it is supported even 6717 * though it is treated as global context. The alternative is 6718 * not failing the single-context invvpid, and it is worse. 6719 */ 6720 if (enable_vpid) { 6721 msrs->secondary_ctls_high |= 6722 SECONDARY_EXEC_ENABLE_VPID; 6723 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 6724 VMX_VPID_EXTENT_SUPPORTED_MASK; 6725 } 6726 6727 if (enable_unrestricted_guest) 6728 msrs->secondary_ctls_high |= 6729 SECONDARY_EXEC_UNRESTRICTED_GUEST; 6730 6731 if (flexpriority_enabled) 6732 msrs->secondary_ctls_high |= 6733 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6734 6735 if (enable_sgx) 6736 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 6737 6738 /* miscellaneous data */ 6739 rdmsr(MSR_IA32_VMX_MISC, 6740 msrs->misc_low, 6741 msrs->misc_high); 6742 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; 6743 msrs->misc_low |= 6744 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 6745 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 6746 VMX_MISC_ACTIVITY_HLT | 6747 VMX_MISC_ACTIVITY_WAIT_SIPI; 6748 msrs->misc_high = 0; 6749 6750 /* 6751 * This MSR reports some information about VMX support. We 6752 * should return information about the VMX we emulate for the 6753 * guest, and the VMCS structure we give it - not about the 6754 * VMX support of the underlying hardware. 6755 */ 6756 msrs->basic = 6757 VMCS12_REVISION | 6758 VMX_BASIC_TRUE_CTLS | 6759 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 6760 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 6761 6762 if (cpu_has_vmx_basic_inout()) 6763 msrs->basic |= VMX_BASIC_INOUT; 6764 6765 /* 6766 * These MSRs specify bits which the guest must keep fixed on 6767 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 6768 * We picked the standard core2 setting. 6769 */ 6770 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 6771 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 6772 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 6773 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 6774 6775 /* These MSRs specify bits which the guest must keep fixed off. 
*/ 6776 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 6777 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 6778 6779 msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); 6780 } 6781 6782 void nested_vmx_hardware_unsetup(void) 6783 { 6784 int i; 6785 6786 if (enable_shadow_vmcs) { 6787 for (i = 0; i < VMX_BITMAP_NR; i++) 6788 free_page((unsigned long)vmx_bitmap[i]); 6789 } 6790 } 6791 6792 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 6793 { 6794 int i; 6795 6796 if (!cpu_has_vmx_shadow_vmcs()) 6797 enable_shadow_vmcs = 0; 6798 if (enable_shadow_vmcs) { 6799 for (i = 0; i < VMX_BITMAP_NR; i++) { 6800 /* 6801 * The vmx_bitmap is not tied to a VM and so should 6802 * not be charged to a memcg. 6803 */ 6804 vmx_bitmap[i] = (unsigned long *) 6805 __get_free_page(GFP_KERNEL); 6806 if (!vmx_bitmap[i]) { 6807 nested_vmx_hardware_unsetup(); 6808 return -ENOMEM; 6809 } 6810 } 6811 6812 init_vmcs_shadow_fields(); 6813 } 6814 6815 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 6816 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 6817 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 6818 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 6819 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 6820 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 6821 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 6822 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; 6823 exit_handlers[EXIT_REASON_VMON] = handle_vmon; 6824 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 6825 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 6826 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 6827 6828 return 0; 6829 } 6830 6831 struct kvm_x86_nested_ops vmx_nested_ops = { 6832 .leave_nested = vmx_leave_nested, 6833 .check_events = vmx_check_nested_events, 6834 .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround, 6835 .hv_timer_pending = nested_vmx_preemption_timer_pending, 6836 .triple_fault = nested_vmx_triple_fault, 6837 .get_state = vmx_get_nested_state, 6838 .set_state = vmx_set_nested_state, 6839 .get_nested_state_pages = vmx_get_nested_state_pages, 6840 .write_log_dirty = nested_vmx_write_pml_buffer, 6841 .enable_evmcs = nested_enable_evmcs, 6842 .get_evmcs_version = nested_get_evmcs_version, 6843 }; 6844