1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/objtool.h> 4 #include <linux/percpu.h> 5 6 #include <asm/debugreg.h> 7 #include <asm/mmu_context.h> 8 9 #include "cpuid.h" 10 #include "hyperv.h" 11 #include "mmu.h" 12 #include "nested.h" 13 #include "pmu.h" 14 #include "sgx.h" 15 #include "trace.h" 16 #include "vmx.h" 17 #include "x86.h" 18 19 static bool __read_mostly enable_shadow_vmcs = 1; 20 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 21 22 static bool __read_mostly nested_early_check = 0; 23 module_param(nested_early_check, bool, S_IRUGO); 24 25 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 26 27 /* 28 * Hyper-V requires all of these, so mark them as supported even though 29 * they are just treated the same as all-context. 30 */ 31 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 32 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 33 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 34 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 35 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 36 37 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 38 39 enum { 40 VMX_VMREAD_BITMAP, 41 VMX_VMWRITE_BITMAP, 42 VMX_BITMAP_NR 43 }; 44 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 45 46 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 47 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 48 49 struct shadow_vmcs_field { 50 u16 encoding; 51 u16 offset; 52 }; 53 static struct shadow_vmcs_field shadow_read_only_fields[] = { 54 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 55 #include "vmcs_shadow_fields.h" 56 }; 57 static int max_shadow_read_only_fields = 58 ARRAY_SIZE(shadow_read_only_fields); 59 60 static struct shadow_vmcs_field shadow_read_write_fields[] = { 61 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 62 #include "vmcs_shadow_fields.h" 63 }; 64 static int max_shadow_read_write_fields = 65 ARRAY_SIZE(shadow_read_write_fields); 66 67 static void init_vmcs_shadow_fields(void) 68 { 69 int i, j; 70 71 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 72 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 73 74 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 75 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 76 u16 field = entry.encoding; 77 78 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 79 (i + 1 == max_shadow_read_only_fields || 80 shadow_read_only_fields[i + 1].encoding != field + 1)) 81 pr_err("Missing field from shadow_read_only_field %x\n", 82 field + 1); 83 84 clear_bit(field, vmx_vmread_bitmap); 85 if (field & 1) 86 #ifdef CONFIG_X86_64 87 continue; 88 #else 89 entry.offset += sizeof(u32); 90 #endif 91 shadow_read_only_fields[j++] = entry; 92 } 93 max_shadow_read_only_fields = j; 94 95 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 96 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 97 u16 field = entry.encoding; 98 99 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 100 (i + 1 == max_shadow_read_write_fields || 101 shadow_read_write_fields[i + 1].encoding != field + 1)) 102 pr_err("Missing field from shadow_read_write_field %x\n", 103 field + 1); 104 105 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 106 field <= GUEST_TR_AR_BYTES, 107 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 108 109 /* 110 * PML and the preemption timer can be emulated, but the 111 * processor cannot vmwrite to fields that don't exist 112 * on bare metal. 
113 */ 114 switch (field) { 115 case GUEST_PML_INDEX: 116 if (!cpu_has_vmx_pml()) 117 continue; 118 break; 119 case VMX_PREEMPTION_TIMER_VALUE: 120 if (!cpu_has_vmx_preemption_timer()) 121 continue; 122 break; 123 case GUEST_INTR_STATUS: 124 if (!cpu_has_vmx_apicv()) 125 continue; 126 break; 127 default: 128 break; 129 } 130 131 clear_bit(field, vmx_vmwrite_bitmap); 132 clear_bit(field, vmx_vmread_bitmap); 133 if (field & 1) 134 #ifdef CONFIG_X86_64 135 continue; 136 #else 137 entry.offset += sizeof(u32); 138 #endif 139 shadow_read_write_fields[j++] = entry; 140 } 141 max_shadow_read_write_fields = j; 142 } 143 144 /* 145 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 146 * set the success or error code of an emulated VMX instruction (as specified 147 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated 148 * instruction. 149 */ 150 static int nested_vmx_succeed(struct kvm_vcpu *vcpu) 151 { 152 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 153 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 154 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 155 return kvm_skip_emulated_instruction(vcpu); 156 } 157 158 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 159 { 160 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 161 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 162 X86_EFLAGS_SF | X86_EFLAGS_OF)) 163 | X86_EFLAGS_CF); 164 return kvm_skip_emulated_instruction(vcpu); 165 } 166 167 static int nested_vmx_failValid(struct kvm_vcpu *vcpu, 168 u32 vm_instruction_error) 169 { 170 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 171 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 172 X86_EFLAGS_SF | X86_EFLAGS_OF)) 173 | X86_EFLAGS_ZF); 174 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 175 /* 176 * We don't need to force sync to shadow VMCS because 177 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all 178 * fields and thus must be synced. 179 */ 180 if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID) 181 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; 182 183 return kvm_skip_emulated_instruction(vcpu); 184 } 185 186 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) 187 { 188 struct vcpu_vmx *vmx = to_vmx(vcpu); 189 190 /* 191 * failValid writes the error number to the current VMCS, which 192 * can't be done if there isn't a current VMCS. 193 */ 194 if (vmx->nested.current_vmptr == INVALID_GPA && 195 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 196 return nested_vmx_failInvalid(vcpu); 197 198 return nested_vmx_failValid(vcpu, vm_instruction_error); 199 } 200 201 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 202 { 203 /* TODO: not to reset guest simply here. 
*/ 204 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 205 pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); 206 } 207 208 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 209 { 210 return fixed_bits_valid(control, low, high); 211 } 212 213 static inline u64 vmx_control_msr(u32 low, u32 high) 214 { 215 return low | ((u64)high << 32); 216 } 217 218 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 219 { 220 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 221 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 222 vmx->nested.need_vmcs12_to_shadow_sync = false; 223 } 224 225 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) 226 { 227 struct vcpu_vmx *vmx = to_vmx(vcpu); 228 229 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 230 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); 231 vmx->nested.hv_evmcs = NULL; 232 } 233 234 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 235 } 236 237 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 238 struct loaded_vmcs *prev) 239 { 240 struct vmcs_host_state *dest, *src; 241 242 if (unlikely(!vmx->guest_state_loaded)) 243 return; 244 245 src = &prev->host_state; 246 dest = &vmx->loaded_vmcs->host_state; 247 248 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 249 dest->ldt_sel = src->ldt_sel; 250 #ifdef CONFIG_X86_64 251 dest->ds_sel = src->ds_sel; 252 dest->es_sel = src->es_sel; 253 #endif 254 } 255 256 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 257 { 258 struct vcpu_vmx *vmx = to_vmx(vcpu); 259 struct loaded_vmcs *prev; 260 int cpu; 261 262 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) 263 return; 264 265 cpu = get_cpu(); 266 prev = vmx->loaded_vmcs; 267 vmx->loaded_vmcs = vmcs; 268 vmx_vcpu_load_vmcs(vcpu, cpu, prev); 269 vmx_sync_vmcs_host_state(vmx, prev); 270 put_cpu(); 271 272 vmx_register_cache_reset(vcpu); 273 } 274 275 /* 276 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 277 * just stops using VMX. 
278 */ 279 static void free_nested(struct kvm_vcpu *vcpu) 280 { 281 struct vcpu_vmx *vmx = to_vmx(vcpu); 282 283 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) 284 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 285 286 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 287 return; 288 289 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 290 291 vmx->nested.vmxon = false; 292 vmx->nested.smm.vmxon = false; 293 vmx->nested.vmxon_ptr = INVALID_GPA; 294 free_vpid(vmx->nested.vpid02); 295 vmx->nested.posted_intr_nv = -1; 296 vmx->nested.current_vmptr = INVALID_GPA; 297 if (enable_shadow_vmcs) { 298 vmx_disable_shadow_vmcs(vmx); 299 vmcs_clear(vmx->vmcs01.shadow_vmcs); 300 free_vmcs(vmx->vmcs01.shadow_vmcs); 301 vmx->vmcs01.shadow_vmcs = NULL; 302 } 303 kfree(vmx->nested.cached_vmcs12); 304 vmx->nested.cached_vmcs12 = NULL; 305 kfree(vmx->nested.cached_shadow_vmcs12); 306 vmx->nested.cached_shadow_vmcs12 = NULL; 307 /* Unpin physical memory we referred to in the vmcs02 */ 308 if (vmx->nested.apic_access_page) { 309 kvm_release_page_clean(vmx->nested.apic_access_page); 310 vmx->nested.apic_access_page = NULL; 311 } 312 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 313 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 314 vmx->nested.pi_desc = NULL; 315 316 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 317 318 nested_release_evmcs(vcpu); 319 320 free_loaded_vmcs(&vmx->nested.vmcs02); 321 } 322 323 /* 324 * Ensure that the current vmcs of the logical processor is the 325 * vmcs01 of the vcpu before calling free_nested(). 326 */ 327 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 328 { 329 vcpu_load(vcpu); 330 vmx_leave_nested(vcpu); 331 vcpu_put(vcpu); 332 } 333 334 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 335 336 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 337 { 338 return VALID_PAGE(root_hpa) && 339 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 340 } 341 342 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 343 gpa_t addr) 344 { 345 uint i; 346 struct kvm_mmu_root_info *cached_root; 347 348 WARN_ON_ONCE(!mmu_is_nested(vcpu)); 349 350 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 351 cached_root = &vcpu->arch.mmu->prev_roots[i]; 352 353 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 354 eptp)) 355 vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); 356 } 357 } 358 359 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 360 struct x86_exception *fault) 361 { 362 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 363 struct vcpu_vmx *vmx = to_vmx(vcpu); 364 u32 vm_exit_reason; 365 unsigned long exit_qualification = vcpu->arch.exit_qualification; 366 367 if (vmx->nested.pml_full) { 368 vm_exit_reason = EXIT_REASON_PML_FULL; 369 vmx->nested.pml_full = false; 370 exit_qualification &= INTR_INFO_UNBLOCK_NMI; 371 } else { 372 if (fault->error_code & PFERR_RSVD_MASK) 373 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 374 else 375 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 376 377 /* 378 * Although the caller (kvm_inject_emulated_page_fault) would 379 * have already synced the faulting address in the shadow EPT 380 * tables for the current EPTP12, we also need to sync it for 381 * any other cached EPTP02s based on the same EP4TA, since the 382 * TLB associates mappings to the EP4TA rather than the full EPTP. 
383 */ 384 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, 385 fault->address); 386 } 387 388 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 389 vmcs12->guest_physical_address = fault->address; 390 } 391 392 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) 393 { 394 kvm_init_shadow_ept_mmu(vcpu, 395 to_vmx(vcpu)->nested.msrs.ept_caps & 396 VMX_EPT_EXECUTE_ONLY_BIT, 397 nested_ept_ad_enabled(vcpu), 398 nested_ept_get_eptp(vcpu)); 399 } 400 401 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 402 { 403 WARN_ON(mmu_is_nested(vcpu)); 404 405 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 406 nested_ept_new_eptp(vcpu); 407 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; 408 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 409 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 410 411 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 412 } 413 414 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 415 { 416 vcpu->arch.mmu = &vcpu->arch.root_mmu; 417 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 418 } 419 420 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 421 u16 error_code) 422 { 423 bool inequality, bit; 424 425 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 426 inequality = 427 (error_code & vmcs12->page_fault_error_code_mask) != 428 vmcs12->page_fault_error_code_match; 429 return inequality ^ bit; 430 } 431 432 433 /* 434 * KVM wants to inject page-faults which it got to the guest. This function 435 * checks whether in a nested guest, we need to inject them to L1 or L2. 436 */ 437 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) 438 { 439 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 440 unsigned int nr = vcpu->arch.exception.nr; 441 bool has_payload = vcpu->arch.exception.has_payload; 442 unsigned long payload = vcpu->arch.exception.payload; 443 444 if (nr == PF_VECTOR) { 445 if (vcpu->arch.exception.nested_apf) { 446 *exit_qual = vcpu->arch.apf.nested_apf_token; 447 return 1; 448 } 449 if (nested_vmx_is_page_fault_vmexit(vmcs12, 450 vcpu->arch.exception.error_code)) { 451 *exit_qual = has_payload ? 
payload : vcpu->arch.cr2; 452 return 1; 453 } 454 } else if (vmcs12->exception_bitmap & (1u << nr)) { 455 if (nr == DB_VECTOR) { 456 if (!has_payload) { 457 payload = vcpu->arch.dr6; 458 payload &= ~DR6_BT; 459 payload ^= DR6_ACTIVE_LOW; 460 } 461 *exit_qual = payload; 462 } else 463 *exit_qual = 0; 464 return 1; 465 } 466 467 return 0; 468 } 469 470 471 static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, 472 struct x86_exception *fault) 473 { 474 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 475 476 WARN_ON(!is_guest_mode(vcpu)); 477 478 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && 479 !to_vmx(vcpu)->nested.nested_run_pending) { 480 vmcs12->vm_exit_intr_error_code = fault->error_code; 481 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 482 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | 483 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, 484 fault->address); 485 } else { 486 kvm_inject_page_fault(vcpu, fault); 487 } 488 } 489 490 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 491 struct vmcs12 *vmcs12) 492 { 493 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 494 return 0; 495 496 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 497 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 498 return -EINVAL; 499 500 return 0; 501 } 502 503 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 504 struct vmcs12 *vmcs12) 505 { 506 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 507 return 0; 508 509 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 510 return -EINVAL; 511 512 return 0; 513 } 514 515 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 516 struct vmcs12 *vmcs12) 517 { 518 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 519 return 0; 520 521 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 522 return -EINVAL; 523 524 return 0; 525 } 526 527 /* 528 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 529 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, 530 * only the "disable intercept" case needs to be handled. 
531 */ 532 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, 533 unsigned long *msr_bitmap_l0, 534 u32 msr, int type) 535 { 536 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) 537 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); 538 539 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) 540 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); 541 } 542 543 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 544 { 545 int msr; 546 547 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 548 unsigned word = msr / BITS_PER_LONG; 549 550 msr_bitmap[word] = ~0; 551 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 552 } 553 } 554 555 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ 556 static inline \ 557 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ 558 unsigned long *msr_bitmap_l1, \ 559 unsigned long *msr_bitmap_l0, u32 msr) \ 560 { \ 561 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ 562 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ 563 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 564 else \ 565 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 566 } 567 BUILD_NVMX_MSR_INTERCEPT_HELPER(read) 568 BUILD_NVMX_MSR_INTERCEPT_HELPER(write) 569 570 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, 571 unsigned long *msr_bitmap_l1, 572 unsigned long *msr_bitmap_l0, 573 u32 msr, int types) 574 { 575 if (types & MSR_TYPE_R) 576 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, 577 msr_bitmap_l0, msr); 578 if (types & MSR_TYPE_W) 579 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, 580 msr_bitmap_l0, msr); 581 } 582 583 /* 584 * Merge L0's and L1's MSR bitmap, return false to indicate that 585 * we do not use the hardware. 586 */ 587 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 588 struct vmcs12 *vmcs12) 589 { 590 struct vcpu_vmx *vmx = to_vmx(vcpu); 591 int msr; 592 unsigned long *msr_bitmap_l1; 593 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; 594 struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; 595 596 /* Nothing to do if the MSR bitmap is not in use. */ 597 if (!cpu_has_vmx_msr_bitmap() || 598 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 599 return false; 600 601 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) 602 return false; 603 604 msr_bitmap_l1 = (unsigned long *)map->hva; 605 606 /* 607 * To keep the control flow simple, pay eight 8-byte writes (sixteen 608 * 4-byte writes on 32-bit systems) up front to enable intercepts for 609 * the x2APIC MSR range and selectively toggle those relevant to L2. 610 */ 611 enable_x2apic_msr_intercepts(msr_bitmap_l0); 612 613 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 614 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 615 /* 616 * L0 need not intercept reads for MSRs between 0x800 617 * and 0x8ff, it just lets the processor take the value 618 * from the virtual-APIC page; take those 256 bits 619 * directly from the L1 bitmap. 
620 */ 621 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 622 unsigned word = msr / BITS_PER_LONG; 623 624 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 625 } 626 } 627 628 nested_vmx_disable_intercept_for_x2apic_msr( 629 msr_bitmap_l1, msr_bitmap_l0, 630 X2APIC_MSR(APIC_TASKPRI), 631 MSR_TYPE_R | MSR_TYPE_W); 632 633 if (nested_cpu_has_vid(vmcs12)) { 634 nested_vmx_disable_intercept_for_x2apic_msr( 635 msr_bitmap_l1, msr_bitmap_l0, 636 X2APIC_MSR(APIC_EOI), 637 MSR_TYPE_W); 638 nested_vmx_disable_intercept_for_x2apic_msr( 639 msr_bitmap_l1, msr_bitmap_l0, 640 X2APIC_MSR(APIC_SELF_IPI), 641 MSR_TYPE_W); 642 } 643 } 644 645 /* 646 * Always check vmcs01's bitmap to honor userspace MSR filters and any 647 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 648 */ 649 #ifdef CONFIG_X86_64 650 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 651 MSR_FS_BASE, MSR_TYPE_RW); 652 653 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 654 MSR_GS_BASE, MSR_TYPE_RW); 655 656 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 657 MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 658 #endif 659 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 660 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 661 662 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 663 MSR_IA32_PRED_CMD, MSR_TYPE_W); 664 665 kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); 666 667 return true; 668 } 669 670 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 671 struct vmcs12 *vmcs12) 672 { 673 struct vcpu_vmx *vmx = to_vmx(vcpu); 674 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 675 676 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 677 vmcs12->vmcs_link_pointer == INVALID_GPA) 678 return; 679 680 if (ghc->gpa != vmcs12->vmcs_link_pointer && 681 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 682 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 683 return; 684 685 kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 686 VMCS12_SIZE); 687 } 688 689 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 690 struct vmcs12 *vmcs12) 691 { 692 struct vcpu_vmx *vmx = to_vmx(vcpu); 693 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 694 695 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 696 vmcs12->vmcs_link_pointer == INVALID_GPA) 697 return; 698 699 if (ghc->gpa != vmcs12->vmcs_link_pointer && 700 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 701 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 702 return; 703 704 kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 705 VMCS12_SIZE); 706 } 707 708 /* 709 * In nested virtualization, check if L1 has set 710 * VM_EXIT_ACK_INTR_ON_EXIT 711 */ 712 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 713 { 714 return get_vmcs12(vcpu)->vm_exit_controls & 715 VM_EXIT_ACK_INTR_ON_EXIT; 716 } 717 718 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 719 struct vmcs12 *vmcs12) 720 { 721 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 722 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 723 return -EINVAL; 724 else 725 return 0; 726 } 727 728 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 729 struct vmcs12 *vmcs12) 730 { 731 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 732 !nested_cpu_has_apic_reg_virt(vmcs12) && 733 !nested_cpu_has_vid(vmcs12) && 734 !nested_cpu_has_posted_intr(vmcs12)) 735 return 0; 736 737 /* 738 * If virtualize x2apic mode is enabled, 739 * virtualize 
apic access must be disabled. 740 */ 741 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 742 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 743 return -EINVAL; 744 745 /* 746 * If virtual interrupt delivery is enabled, 747 * we must exit on external interrupts. 748 */ 749 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 750 return -EINVAL; 751 752 /* 753 * bits 15:8 should be zero in posted_intr_nv, 754 * the descriptor address has been already checked 755 * in nested_get_vmcs12_pages. 756 * 757 * bits 5:0 of posted_intr_desc_addr should be zero. 758 */ 759 if (nested_cpu_has_posted_intr(vmcs12) && 760 (CC(!nested_cpu_has_vid(vmcs12)) || 761 CC(!nested_exit_intr_ack_set(vcpu)) || 762 CC((vmcs12->posted_intr_nv & 0xff00)) || 763 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) 764 return -EINVAL; 765 766 /* tpr shadow is needed by all apicv features. */ 767 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 768 return -EINVAL; 769 770 return 0; 771 } 772 773 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 774 u32 count, u64 addr) 775 { 776 if (count == 0) 777 return 0; 778 779 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || 780 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) 781 return -EINVAL; 782 783 return 0; 784 } 785 786 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 787 struct vmcs12 *vmcs12) 788 { 789 if (CC(nested_vmx_check_msr_switch(vcpu, 790 vmcs12->vm_exit_msr_load_count, 791 vmcs12->vm_exit_msr_load_addr)) || 792 CC(nested_vmx_check_msr_switch(vcpu, 793 vmcs12->vm_exit_msr_store_count, 794 vmcs12->vm_exit_msr_store_addr))) 795 return -EINVAL; 796 797 return 0; 798 } 799 800 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 801 struct vmcs12 *vmcs12) 802 { 803 if (CC(nested_vmx_check_msr_switch(vcpu, 804 vmcs12->vm_entry_msr_load_count, 805 vmcs12->vm_entry_msr_load_addr))) 806 return -EINVAL; 807 808 return 0; 809 } 810 811 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 812 struct vmcs12 *vmcs12) 813 { 814 if (!nested_cpu_has_pml(vmcs12)) 815 return 0; 816 817 if (CC(!nested_cpu_has_ept(vmcs12)) || 818 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 819 return -EINVAL; 820 821 return 0; 822 } 823 824 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 825 struct vmcs12 *vmcs12) 826 { 827 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 828 !nested_cpu_has_ept(vmcs12))) 829 return -EINVAL; 830 return 0; 831 } 832 833 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 834 struct vmcs12 *vmcs12) 835 { 836 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 837 !nested_cpu_has_ept(vmcs12))) 838 return -EINVAL; 839 return 0; 840 } 841 842 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 843 struct vmcs12 *vmcs12) 844 { 845 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 846 return 0; 847 848 if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 849 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 850 return -EINVAL; 851 852 return 0; 853 } 854 855 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 856 struct vmx_msr_entry *e) 857 { 858 /* x2APIC MSR accesses are not allowed */ 859 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 860 return -EINVAL; 861 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 862 CC(e->index 
== MSR_IA32_UCODE_REV)) 863 return -EINVAL; 864 if (CC(e->reserved != 0)) 865 return -EINVAL; 866 return 0; 867 } 868 869 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 870 struct vmx_msr_entry *e) 871 { 872 if (CC(e->index == MSR_FS_BASE) || 873 CC(e->index == MSR_GS_BASE) || 874 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 875 nested_vmx_msr_check_common(vcpu, e)) 876 return -EINVAL; 877 return 0; 878 } 879 880 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 881 struct vmx_msr_entry *e) 882 { 883 if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ 884 nested_vmx_msr_check_common(vcpu, e)) 885 return -EINVAL; 886 return 0; 887 } 888 889 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) 890 { 891 struct vcpu_vmx *vmx = to_vmx(vcpu); 892 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 893 vmx->nested.msrs.misc_high); 894 895 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; 896 } 897 898 /* 899 * Load guest's/host's msr at nested entry/exit. 900 * return 0 for success, entry index for failure. 901 * 902 * One of the failure modes for MSR load/store is when a list exceeds the 903 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch 904 * as possible, process all valid entries before failing rather than precheck 905 * for a capacity violation. 906 */ 907 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 908 { 909 u32 i; 910 struct vmx_msr_entry e; 911 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 912 913 for (i = 0; i < count; i++) { 914 if (unlikely(i >= max_msr_list_size)) 915 goto fail; 916 917 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 918 &e, sizeof(e))) { 919 pr_debug_ratelimited( 920 "%s cannot read MSR entry (%u, 0x%08llx)\n", 921 __func__, i, gpa + i * sizeof(e)); 922 goto fail; 923 } 924 if (nested_vmx_load_msr_check(vcpu, &e)) { 925 pr_debug_ratelimited( 926 "%s check failed (%u, 0x%x, 0x%x)\n", 927 __func__, i, e.index, e.reserved); 928 goto fail; 929 } 930 if (kvm_set_msr(vcpu, e.index, e.value)) { 931 pr_debug_ratelimited( 932 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 933 __func__, i, e.index, e.value); 934 goto fail; 935 } 936 } 937 return 0; 938 fail: 939 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ 940 return i + 1; 941 } 942 943 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, 944 u32 msr_index, 945 u64 *data) 946 { 947 struct vcpu_vmx *vmx = to_vmx(vcpu); 948 949 /* 950 * If the L0 hypervisor stored a more accurate value for the TSC that 951 * does not include the time taken for emulation of the L2->L1 952 * VM-exit in L0, use the more accurate value. 
953 */ 954 if (msr_index == MSR_IA32_TSC) { 955 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, 956 MSR_IA32_TSC); 957 958 if (i >= 0) { 959 u64 val = vmx->msr_autostore.guest.val[i].value; 960 961 *data = kvm_read_l1_tsc(vcpu, val); 962 return true; 963 } 964 } 965 966 if (kvm_get_msr(vcpu, msr_index, data)) { 967 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 968 msr_index); 969 return false; 970 } 971 return true; 972 } 973 974 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 975 struct vmx_msr_entry *e) 976 { 977 if (kvm_vcpu_read_guest(vcpu, 978 gpa + i * sizeof(*e), 979 e, 2 * sizeof(u32))) { 980 pr_debug_ratelimited( 981 "%s cannot read MSR entry (%u, 0x%08llx)\n", 982 __func__, i, gpa + i * sizeof(*e)); 983 return false; 984 } 985 if (nested_vmx_store_msr_check(vcpu, e)) { 986 pr_debug_ratelimited( 987 "%s check failed (%u, 0x%x, 0x%x)\n", 988 __func__, i, e->index, e->reserved); 989 return false; 990 } 991 return true; 992 } 993 994 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 995 { 996 u64 data; 997 u32 i; 998 struct vmx_msr_entry e; 999 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1000 1001 for (i = 0; i < count; i++) { 1002 if (unlikely(i >= max_msr_list_size)) 1003 return -EINVAL; 1004 1005 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1006 return -EINVAL; 1007 1008 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 1009 return -EINVAL; 1010 1011 if (kvm_vcpu_write_guest(vcpu, 1012 gpa + i * sizeof(e) + 1013 offsetof(struct vmx_msr_entry, value), 1014 &data, sizeof(data))) { 1015 pr_debug_ratelimited( 1016 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1017 __func__, i, e.index, data); 1018 return -EINVAL; 1019 } 1020 } 1021 return 0; 1022 } 1023 1024 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1025 { 1026 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1027 u32 count = vmcs12->vm_exit_msr_store_count; 1028 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1029 struct vmx_msr_entry e; 1030 u32 i; 1031 1032 for (i = 0; i < count; i++) { 1033 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1034 return false; 1035 1036 if (e.index == msr_index) 1037 return true; 1038 } 1039 return false; 1040 } 1041 1042 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, 1043 u32 msr_index) 1044 { 1045 struct vcpu_vmx *vmx = to_vmx(vcpu); 1046 struct vmx_msrs *autostore = &vmx->msr_autostore.guest; 1047 bool in_vmcs12_store_list; 1048 int msr_autostore_slot; 1049 bool in_autostore_list; 1050 int last; 1051 1052 msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); 1053 in_autostore_list = msr_autostore_slot >= 0; 1054 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); 1055 1056 if (in_vmcs12_store_list && !in_autostore_list) { 1057 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { 1058 /* 1059 * Emulated VMEntry does not fail here. Instead a less 1060 * accurate value will be returned by 1061 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() 1062 * instead of reading the value from the vmcs02 VMExit 1063 * MSR-store area. 1064 */ 1065 pr_warn_ratelimited( 1066 "Not enough msr entries in msr_autostore. 
Can't add msr %x\n", 1067 msr_index); 1068 return; 1069 } 1070 last = autostore->nr++; 1071 autostore->val[last].index = msr_index; 1072 } else if (!in_vmcs12_store_list && in_autostore_list) { 1073 last = --autostore->nr; 1074 autostore->val[msr_autostore_slot] = autostore->val[last]; 1075 } 1076 } 1077 1078 /* 1079 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are 1080 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected 1081 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to 1082 * @entry_failure_code. 1083 */ 1084 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, 1085 bool nested_ept, bool reload_pdptrs, 1086 enum vm_entry_failure_code *entry_failure_code) 1087 { 1088 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { 1089 *entry_failure_code = ENTRY_FAIL_DEFAULT; 1090 return -EINVAL; 1091 } 1092 1093 /* 1094 * If PAE paging and EPT are both on, CR3 is not used by the CPU and 1095 * must not be dereferenced. 1096 */ 1097 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && 1098 CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { 1099 *entry_failure_code = ENTRY_FAIL_PDPTE; 1100 return -EINVAL; 1101 } 1102 1103 if (!nested_ept) 1104 kvm_mmu_new_pgd(vcpu, cr3); 1105 1106 vcpu->arch.cr3 = cr3; 1107 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 1108 1109 /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ 1110 kvm_init_mmu(vcpu); 1111 1112 return 0; 1113 } 1114 1115 /* 1116 * Returns if KVM is able to config CPU to tag TLB entries 1117 * populated by L2 differently than TLB entries populated 1118 * by L1. 1119 * 1120 * If L0 uses EPT, L1 and L2 run with different EPTP because 1121 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries 1122 * are tagged with different EPTP. 1123 * 1124 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged 1125 * with different VPID (L1 entries are tagged with vmx->vpid 1126 * while L2 entries are tagged with vmx->nested.vpid02). 1127 */ 1128 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) 1129 { 1130 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1131 1132 return enable_ept || 1133 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); 1134 } 1135 1136 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, 1137 struct vmcs12 *vmcs12, 1138 bool is_vmenter) 1139 { 1140 struct vcpu_vmx *vmx = to_vmx(vcpu); 1141 1142 /* 1143 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings 1144 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a 1145 * full TLB flush from the guest's perspective. This is required even 1146 * if VPID is disabled in the host as KVM may need to synchronize the 1147 * MMU in response to the guest TLB flush. 1148 * 1149 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. 1150 * EPT is a special snowflake, as guest-physical mappings aren't 1151 * flushed on VPID invalidations, including VM-Enter or VM-Exit with 1152 * VPID disabled. As a result, KVM _never_ needs to sync nEPT 1153 * entries on VM-Enter because L1 can't rely on VM-Enter to flush 1154 * those mappings. 1155 */ 1156 if (!nested_cpu_has_vpid(vmcs12)) { 1157 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1158 return; 1159 } 1160 1161 /* L2 should never have a VPID if VPID is disabled. */ 1162 WARN_ON(!enable_vpid); 1163 1164 /* 1165 * VPID is enabled and in use by vmcs12. 
If vpid12 is changing, then 1166 * emulate a guest TLB flush as KVM does not track vpid12 history nor 1167 * is the VPID incorporated into the MMU context. I.e. KVM must assume 1168 * that the new vpid12 has never been used and thus represents a new 1169 * guest ASID that cannot have entries in the TLB. 1170 */ 1171 if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 1172 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 1173 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1174 return; 1175 } 1176 1177 /* 1178 * If VPID is enabled, used by vmc12, and vpid12 is not changing but 1179 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and 1180 * KVM was unable to allocate a VPID for L2, flush the current context 1181 * as the effective ASID is common to both L1 and L2. 1182 */ 1183 if (!nested_has_guest_tlb_tag(vcpu)) 1184 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 1185 } 1186 1187 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) 1188 { 1189 superset &= mask; 1190 subset &= mask; 1191 1192 return (superset | subset) == superset; 1193 } 1194 1195 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1196 { 1197 const u64 feature_and_reserved = 1198 /* feature (except bit 48; see below) */ 1199 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | 1200 /* reserved */ 1201 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); 1202 u64 vmx_basic = vmx->nested.msrs.basic; 1203 1204 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) 1205 return -EINVAL; 1206 1207 /* 1208 * KVM does not emulate a version of VMX that constrains physical 1209 * addresses of VMX structures (e.g. VMCS) to 32-bits. 1210 */ 1211 if (data & BIT_ULL(48)) 1212 return -EINVAL; 1213 1214 if (vmx_basic_vmcs_revision_id(vmx_basic) != 1215 vmx_basic_vmcs_revision_id(data)) 1216 return -EINVAL; 1217 1218 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 1219 return -EINVAL; 1220 1221 vmx->nested.msrs.basic = data; 1222 return 0; 1223 } 1224 1225 static int 1226 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1227 { 1228 u64 supported; 1229 u32 *lowp, *highp; 1230 1231 switch (msr_index) { 1232 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1233 lowp = &vmx->nested.msrs.pinbased_ctls_low; 1234 highp = &vmx->nested.msrs.pinbased_ctls_high; 1235 break; 1236 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1237 lowp = &vmx->nested.msrs.procbased_ctls_low; 1238 highp = &vmx->nested.msrs.procbased_ctls_high; 1239 break; 1240 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1241 lowp = &vmx->nested.msrs.exit_ctls_low; 1242 highp = &vmx->nested.msrs.exit_ctls_high; 1243 break; 1244 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1245 lowp = &vmx->nested.msrs.entry_ctls_low; 1246 highp = &vmx->nested.msrs.entry_ctls_high; 1247 break; 1248 case MSR_IA32_VMX_PROCBASED_CTLS2: 1249 lowp = &vmx->nested.msrs.secondary_ctls_low; 1250 highp = &vmx->nested.msrs.secondary_ctls_high; 1251 break; 1252 default: 1253 BUG(); 1254 } 1255 1256 supported = vmx_control_msr(*lowp, *highp); 1257 1258 /* Check must-be-1 bits are still 1. */ 1259 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) 1260 return -EINVAL; 1261 1262 /* Check must-be-0 bits are still 0. 
*/ 1263 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) 1264 return -EINVAL; 1265 1266 *lowp = data; 1267 *highp = data >> 32; 1268 return 0; 1269 } 1270 1271 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1272 { 1273 const u64 feature_and_reserved_bits = 1274 /* feature */ 1275 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | 1276 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | 1277 /* reserved */ 1278 GENMASK_ULL(13, 9) | BIT_ULL(31); 1279 u64 vmx_misc; 1280 1281 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 1282 vmx->nested.msrs.misc_high); 1283 1284 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) 1285 return -EINVAL; 1286 1287 if ((vmx->nested.msrs.pinbased_ctls_high & 1288 PIN_BASED_VMX_PREEMPTION_TIMER) && 1289 vmx_misc_preemption_timer_rate(data) != 1290 vmx_misc_preemption_timer_rate(vmx_misc)) 1291 return -EINVAL; 1292 1293 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1294 return -EINVAL; 1295 1296 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1297 return -EINVAL; 1298 1299 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1300 return -EINVAL; 1301 1302 vmx->nested.msrs.misc_low = data; 1303 vmx->nested.msrs.misc_high = data >> 32; 1304 1305 return 0; 1306 } 1307 1308 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1309 { 1310 u64 vmx_ept_vpid_cap; 1311 1312 vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, 1313 vmx->nested.msrs.vpid_caps); 1314 1315 /* Every bit is either reserved or a feature bit. */ 1316 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 1317 return -EINVAL; 1318 1319 vmx->nested.msrs.ept_caps = data; 1320 vmx->nested.msrs.vpid_caps = data >> 32; 1321 return 0; 1322 } 1323 1324 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1325 { 1326 u64 *msr; 1327 1328 switch (msr_index) { 1329 case MSR_IA32_VMX_CR0_FIXED0: 1330 msr = &vmx->nested.msrs.cr0_fixed0; 1331 break; 1332 case MSR_IA32_VMX_CR4_FIXED0: 1333 msr = &vmx->nested.msrs.cr4_fixed0; 1334 break; 1335 default: 1336 BUG(); 1337 } 1338 1339 /* 1340 * 1 bits (which indicates bits which "must-be-1" during VMX operation) 1341 * must be 1 in the restored value. 1342 */ 1343 if (!is_bitwise_subset(data, *msr, -1ULL)) 1344 return -EINVAL; 1345 1346 *msr = data; 1347 return 0; 1348 } 1349 1350 /* 1351 * Called when userspace is restoring VMX MSRs. 1352 * 1353 * Returns 0 on success, non-0 otherwise. 1354 */ 1355 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1356 { 1357 struct vcpu_vmx *vmx = to_vmx(vcpu); 1358 1359 /* 1360 * Don't allow changes to the VMX capability MSRs while the vCPU 1361 * is in VMX operation. 1362 */ 1363 if (vmx->nested.vmxon) 1364 return -EBUSY; 1365 1366 switch (msr_index) { 1367 case MSR_IA32_VMX_BASIC: 1368 return vmx_restore_vmx_basic(vmx, data); 1369 case MSR_IA32_VMX_PINBASED_CTLS: 1370 case MSR_IA32_VMX_PROCBASED_CTLS: 1371 case MSR_IA32_VMX_EXIT_CTLS: 1372 case MSR_IA32_VMX_ENTRY_CTLS: 1373 /* 1374 * The "non-true" VMX capability MSRs are generated from the 1375 * "true" MSRs, so we do not support restoring them directly. 1376 * 1377 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1378 * should restore the "true" MSRs with the must-be-1 bits 1379 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND 1380 * DEFAULT SETTINGS". 
1381 */ 1382 return -EINVAL; 1383 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1384 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1385 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1386 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1387 case MSR_IA32_VMX_PROCBASED_CTLS2: 1388 return vmx_restore_control_msr(vmx, msr_index, data); 1389 case MSR_IA32_VMX_MISC: 1390 return vmx_restore_vmx_misc(vmx, data); 1391 case MSR_IA32_VMX_CR0_FIXED0: 1392 case MSR_IA32_VMX_CR4_FIXED0: 1393 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1394 case MSR_IA32_VMX_CR0_FIXED1: 1395 case MSR_IA32_VMX_CR4_FIXED1: 1396 /* 1397 * These MSRs are generated based on the vCPU's CPUID, so we 1398 * do not support restoring them directly. 1399 */ 1400 return -EINVAL; 1401 case MSR_IA32_VMX_EPT_VPID_CAP: 1402 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 1403 case MSR_IA32_VMX_VMCS_ENUM: 1404 vmx->nested.msrs.vmcs_enum = data; 1405 return 0; 1406 case MSR_IA32_VMX_VMFUNC: 1407 if (data & ~vmx->nested.msrs.vmfunc_controls) 1408 return -EINVAL; 1409 vmx->nested.msrs.vmfunc_controls = data; 1410 return 0; 1411 default: 1412 /* 1413 * The rest of the VMX capability MSRs do not support restore. 1414 */ 1415 return -EINVAL; 1416 } 1417 } 1418 1419 /* Returns 0 on success, non-0 otherwise. */ 1420 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1421 { 1422 switch (msr_index) { 1423 case MSR_IA32_VMX_BASIC: 1424 *pdata = msrs->basic; 1425 break; 1426 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1427 case MSR_IA32_VMX_PINBASED_CTLS: 1428 *pdata = vmx_control_msr( 1429 msrs->pinbased_ctls_low, 1430 msrs->pinbased_ctls_high); 1431 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1432 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1433 break; 1434 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1435 case MSR_IA32_VMX_PROCBASED_CTLS: 1436 *pdata = vmx_control_msr( 1437 msrs->procbased_ctls_low, 1438 msrs->procbased_ctls_high); 1439 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1440 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1441 break; 1442 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1443 case MSR_IA32_VMX_EXIT_CTLS: 1444 *pdata = vmx_control_msr( 1445 msrs->exit_ctls_low, 1446 msrs->exit_ctls_high); 1447 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1448 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1449 break; 1450 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1451 case MSR_IA32_VMX_ENTRY_CTLS: 1452 *pdata = vmx_control_msr( 1453 msrs->entry_ctls_low, 1454 msrs->entry_ctls_high); 1455 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1456 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1457 break; 1458 case MSR_IA32_VMX_MISC: 1459 *pdata = vmx_control_msr( 1460 msrs->misc_low, 1461 msrs->misc_high); 1462 break; 1463 case MSR_IA32_VMX_CR0_FIXED0: 1464 *pdata = msrs->cr0_fixed0; 1465 break; 1466 case MSR_IA32_VMX_CR0_FIXED1: 1467 *pdata = msrs->cr0_fixed1; 1468 break; 1469 case MSR_IA32_VMX_CR4_FIXED0: 1470 *pdata = msrs->cr4_fixed0; 1471 break; 1472 case MSR_IA32_VMX_CR4_FIXED1: 1473 *pdata = msrs->cr4_fixed1; 1474 break; 1475 case MSR_IA32_VMX_VMCS_ENUM: 1476 *pdata = msrs->vmcs_enum; 1477 break; 1478 case MSR_IA32_VMX_PROCBASED_CTLS2: 1479 *pdata = vmx_control_msr( 1480 msrs->secondary_ctls_low, 1481 msrs->secondary_ctls_high); 1482 break; 1483 case MSR_IA32_VMX_EPT_VPID_CAP: 1484 *pdata = msrs->ept_caps | 1485 ((u64)msrs->vpid_caps << 32); 1486 break; 1487 case MSR_IA32_VMX_VMFUNC: 1488 *pdata = msrs->vmfunc_controls; 1489 break; 1490 default: 1491 return 1; 1492 } 1493 1494 return 0; 1495 } 1496 1497 /* 1498 * Copy the writable VMCS shadow fields back to the VMCS12, in case they 
have 1499 * been modified by the L1 guest. Note, "writable" in this context means 1500 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1501 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1502 * VM-exit information fields (which are actually writable if the vCPU is 1503 * configured to support "VMWRITE to any supported field in the VMCS"). 1504 */ 1505 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1506 { 1507 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1508 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1509 struct shadow_vmcs_field field; 1510 unsigned long val; 1511 int i; 1512 1513 if (WARN_ON(!shadow_vmcs)) 1514 return; 1515 1516 preempt_disable(); 1517 1518 vmcs_load(shadow_vmcs); 1519 1520 for (i = 0; i < max_shadow_read_write_fields; i++) { 1521 field = shadow_read_write_fields[i]; 1522 val = __vmcs_readl(field.encoding); 1523 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1524 } 1525 1526 vmcs_clear(shadow_vmcs); 1527 vmcs_load(vmx->loaded_vmcs->vmcs); 1528 1529 preempt_enable(); 1530 } 1531 1532 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1533 { 1534 const struct shadow_vmcs_field *fields[] = { 1535 shadow_read_write_fields, 1536 shadow_read_only_fields 1537 }; 1538 const int max_fields[] = { 1539 max_shadow_read_write_fields, 1540 max_shadow_read_only_fields 1541 }; 1542 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1543 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1544 struct shadow_vmcs_field field; 1545 unsigned long val; 1546 int i, q; 1547 1548 if (WARN_ON(!shadow_vmcs)) 1549 return; 1550 1551 vmcs_load(shadow_vmcs); 1552 1553 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1554 for (i = 0; i < max_fields[q]; i++) { 1555 field = fields[q][i]; 1556 val = vmcs12_read_any(vmcs12, field.encoding, 1557 field.offset); 1558 __vmcs_writel(field.encoding, val); 1559 } 1560 } 1561 1562 vmcs_clear(shadow_vmcs); 1563 vmcs_load(vmx->loaded_vmcs->vmcs); 1564 } 1565 1566 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) 1567 { 1568 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1569 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1570 1571 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1572 vmcs12->tpr_threshold = evmcs->tpr_threshold; 1573 vmcs12->guest_rip = evmcs->guest_rip; 1574 1575 if (unlikely(!(hv_clean_fields & 1576 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1577 vmcs12->guest_rsp = evmcs->guest_rsp; 1578 vmcs12->guest_rflags = evmcs->guest_rflags; 1579 vmcs12->guest_interruptibility_info = 1580 evmcs->guest_interruptibility_info; 1581 } 1582 1583 if (unlikely(!(hv_clean_fields & 1584 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1585 vmcs12->cpu_based_vm_exec_control = 1586 evmcs->cpu_based_vm_exec_control; 1587 } 1588 1589 if (unlikely(!(hv_clean_fields & 1590 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1591 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1592 } 1593 1594 if (unlikely(!(hv_clean_fields & 1595 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1596 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1597 } 1598 1599 if (unlikely(!(hv_clean_fields & 1600 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1601 vmcs12->vm_entry_intr_info_field = 1602 evmcs->vm_entry_intr_info_field; 1603 vmcs12->vm_entry_exception_error_code = 1604 evmcs->vm_entry_exception_error_code; 1605 vmcs12->vm_entry_instruction_len = 1606 evmcs->vm_entry_instruction_len; 1607 } 1608 1609 if (unlikely(!(hv_clean_fields & 1610 
HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { 1611 vmcs12->host_ia32_pat = evmcs->host_ia32_pat; 1612 vmcs12->host_ia32_efer = evmcs->host_ia32_efer; 1613 vmcs12->host_cr0 = evmcs->host_cr0; 1614 vmcs12->host_cr3 = evmcs->host_cr3; 1615 vmcs12->host_cr4 = evmcs->host_cr4; 1616 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; 1617 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; 1618 vmcs12->host_rip = evmcs->host_rip; 1619 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; 1620 vmcs12->host_es_selector = evmcs->host_es_selector; 1621 vmcs12->host_cs_selector = evmcs->host_cs_selector; 1622 vmcs12->host_ss_selector = evmcs->host_ss_selector; 1623 vmcs12->host_ds_selector = evmcs->host_ds_selector; 1624 vmcs12->host_fs_selector = evmcs->host_fs_selector; 1625 vmcs12->host_gs_selector = evmcs->host_gs_selector; 1626 vmcs12->host_tr_selector = evmcs->host_tr_selector; 1627 } 1628 1629 if (unlikely(!(hv_clean_fields & 1630 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { 1631 vmcs12->pin_based_vm_exec_control = 1632 evmcs->pin_based_vm_exec_control; 1633 vmcs12->vm_exit_controls = evmcs->vm_exit_controls; 1634 vmcs12->secondary_vm_exec_control = 1635 evmcs->secondary_vm_exec_control; 1636 } 1637 1638 if (unlikely(!(hv_clean_fields & 1639 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { 1640 vmcs12->io_bitmap_a = evmcs->io_bitmap_a; 1641 vmcs12->io_bitmap_b = evmcs->io_bitmap_b; 1642 } 1643 1644 if (unlikely(!(hv_clean_fields & 1645 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { 1646 vmcs12->msr_bitmap = evmcs->msr_bitmap; 1647 } 1648 1649 if (unlikely(!(hv_clean_fields & 1650 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { 1651 vmcs12->guest_es_base = evmcs->guest_es_base; 1652 vmcs12->guest_cs_base = evmcs->guest_cs_base; 1653 vmcs12->guest_ss_base = evmcs->guest_ss_base; 1654 vmcs12->guest_ds_base = evmcs->guest_ds_base; 1655 vmcs12->guest_fs_base = evmcs->guest_fs_base; 1656 vmcs12->guest_gs_base = evmcs->guest_gs_base; 1657 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; 1658 vmcs12->guest_tr_base = evmcs->guest_tr_base; 1659 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; 1660 vmcs12->guest_idtr_base = evmcs->guest_idtr_base; 1661 vmcs12->guest_es_limit = evmcs->guest_es_limit; 1662 vmcs12->guest_cs_limit = evmcs->guest_cs_limit; 1663 vmcs12->guest_ss_limit = evmcs->guest_ss_limit; 1664 vmcs12->guest_ds_limit = evmcs->guest_ds_limit; 1665 vmcs12->guest_fs_limit = evmcs->guest_fs_limit; 1666 vmcs12->guest_gs_limit = evmcs->guest_gs_limit; 1667 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; 1668 vmcs12->guest_tr_limit = evmcs->guest_tr_limit; 1669 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; 1670 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; 1671 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; 1672 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1673 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1674 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1675 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1676 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1677 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1678 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1679 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1680 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1681 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1682 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1683 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1684 vmcs12->guest_gs_selector = 
evmcs->guest_gs_selector; 1685 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1686 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1687 } 1688 1689 if (unlikely(!(hv_clean_fields & 1690 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1691 vmcs12->tsc_offset = evmcs->tsc_offset; 1692 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1693 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1694 } 1695 1696 if (unlikely(!(hv_clean_fields & 1697 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1698 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1699 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1700 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1701 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1702 vmcs12->guest_cr0 = evmcs->guest_cr0; 1703 vmcs12->guest_cr3 = evmcs->guest_cr3; 1704 vmcs12->guest_cr4 = evmcs->guest_cr4; 1705 vmcs12->guest_dr7 = evmcs->guest_dr7; 1706 } 1707 1708 if (unlikely(!(hv_clean_fields & 1709 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1710 vmcs12->host_fs_base = evmcs->host_fs_base; 1711 vmcs12->host_gs_base = evmcs->host_gs_base; 1712 vmcs12->host_tr_base = evmcs->host_tr_base; 1713 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1714 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1715 vmcs12->host_rsp = evmcs->host_rsp; 1716 } 1717 1718 if (unlikely(!(hv_clean_fields & 1719 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1720 vmcs12->ept_pointer = evmcs->ept_pointer; 1721 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1722 } 1723 1724 if (unlikely(!(hv_clean_fields & 1725 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1726 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1727 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1728 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1729 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1730 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1731 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1732 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1733 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1734 vmcs12->guest_pending_dbg_exceptions = 1735 evmcs->guest_pending_dbg_exceptions; 1736 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1737 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1738 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1739 vmcs12->guest_activity_state = evmcs->guest_activity_state; 1740 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1741 } 1742 1743 /* 1744 * Not used? 
1745 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1746 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1747 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1748 * vmcs12->page_fault_error_code_mask = 1749 * evmcs->page_fault_error_code_mask; 1750 * vmcs12->page_fault_error_code_match = 1751 * evmcs->page_fault_error_code_match; 1752 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1753 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1754 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1755 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1756 */ 1757 1758 /* 1759 * Read only fields: 1760 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1761 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1762 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1763 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1764 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1765 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1766 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1767 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1768 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1769 * vmcs12->exit_qualification = evmcs->exit_qualification; 1770 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1771 * 1772 * Not present in struct vmcs12: 1773 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1774 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1775 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1776 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1777 */ 1778 1779 return; 1780 } 1781 1782 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1783 { 1784 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1785 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1786 1787 /* 1788 * Should not be changed by KVM: 1789 * 1790 * evmcs->host_es_selector = vmcs12->host_es_selector; 1791 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1792 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1793 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1794 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1795 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1796 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1797 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1798 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1799 * evmcs->host_cr0 = vmcs12->host_cr0; 1800 * evmcs->host_cr3 = vmcs12->host_cr3; 1801 * evmcs->host_cr4 = vmcs12->host_cr4; 1802 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1803 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1804 * evmcs->host_rip = vmcs12->host_rip; 1805 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1806 * evmcs->host_fs_base = vmcs12->host_fs_base; 1807 * evmcs->host_gs_base = vmcs12->host_gs_base; 1808 * evmcs->host_tr_base = vmcs12->host_tr_base; 1809 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1810 * evmcs->host_idtr_base = vmcs12->host_idtr_base; 1811 * evmcs->host_rsp = vmcs12->host_rsp; 1812 * sync_vmcs02_to_vmcs12() doesn't read these: 1813 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1814 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1815 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1816 * evmcs->ept_pointer = vmcs12->ept_pointer; 1817 * 
evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1818 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1819 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1820 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1821 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1822 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1823 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1824 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1825 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1826 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1827 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1828 * evmcs->page_fault_error_code_mask = 1829 * vmcs12->page_fault_error_code_mask; 1830 * evmcs->page_fault_error_code_match = 1831 * vmcs12->page_fault_error_code_match; 1832 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1833 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1834 * evmcs->tsc_offset = vmcs12->tsc_offset; 1835 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1836 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1837 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1838 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1839 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1840 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1841 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1842 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1843 * 1844 * Not present in struct vmcs12: 1845 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1846 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1847 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1848 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1849 */ 1850 1851 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1852 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1853 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1854 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1855 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1856 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1857 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1858 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1859 1860 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1861 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1862 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1863 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1864 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1865 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1866 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1867 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1868 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1869 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1870 1871 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1872 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1873 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1874 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1875 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1876 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1877 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1878 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1879 1880 evmcs->guest_es_base = vmcs12->guest_es_base; 1881 evmcs->guest_cs_base = vmcs12->guest_cs_base; 
1882 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1883 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1884 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1885 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1886 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1887 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1888 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1889 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1890 1891 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1892 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1893 1894 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1895 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1896 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1897 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1898 1899 evmcs->guest_pending_dbg_exceptions = 1900 vmcs12->guest_pending_dbg_exceptions; 1901 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1902 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1903 1904 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1905 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1906 1907 evmcs->guest_cr0 = vmcs12->guest_cr0; 1908 evmcs->guest_cr3 = vmcs12->guest_cr3; 1909 evmcs->guest_cr4 = vmcs12->guest_cr4; 1910 evmcs->guest_dr7 = vmcs12->guest_dr7; 1911 1912 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1913 1914 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1915 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1916 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1917 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1918 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1919 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1920 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1921 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1922 1923 evmcs->exit_qualification = vmcs12->exit_qualification; 1924 1925 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1926 evmcs->guest_rsp = vmcs12->guest_rsp; 1927 evmcs->guest_rflags = vmcs12->guest_rflags; 1928 1929 evmcs->guest_interruptibility_info = 1930 vmcs12->guest_interruptibility_info; 1931 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1932 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1933 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1934 evmcs->vm_entry_exception_error_code = 1935 vmcs12->vm_entry_exception_error_code; 1936 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 1937 1938 evmcs->guest_rip = vmcs12->guest_rip; 1939 1940 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 1941 1942 return; 1943 } 1944 1945 /* 1946 * This is an equivalent of the nested hypervisor executing the vmptrld 1947 * instruction. 
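* The return status tells the caller how to react: EVMPTRLD_DISABLED means eVMCS is not in use, EVMPTRLD_ERROR means the eVMCS GPA could not be mapped, EVMPTRLD_VMFAIL means the first u32 field of the eVMCS holds an unsupported version/revision, and EVMPTRLD_SUCCEEDED means the eVMCS is mapped and usable; see nested_vmx_run() and nested_get_evmcs_page() for how each status is handled.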
1948 */ 1949 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( 1950 struct kvm_vcpu *vcpu, bool from_launch) 1951 { 1952 struct vcpu_vmx *vmx = to_vmx(vcpu); 1953 bool evmcs_gpa_changed = false; 1954 u64 evmcs_gpa; 1955 1956 if (likely(!vmx->nested.enlightened_vmcs_enabled)) 1957 return EVMPTRLD_DISABLED; 1958 1959 if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) { 1960 nested_release_evmcs(vcpu); 1961 return EVMPTRLD_DISABLED; 1962 } 1963 1964 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { 1965 vmx->nested.current_vmptr = INVALID_GPA; 1966 1967 nested_release_evmcs(vcpu); 1968 1969 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), 1970 &vmx->nested.hv_evmcs_map)) 1971 return EVMPTRLD_ERROR; 1972 1973 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; 1974 1975 /* 1976 * Currently, KVM only supports eVMCS version 1 1977 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set the 1978 * first u32 field of the eVMCS, which specifies the eVMCS 1979 * VersionNumber, to this value. 1980 * 1981 * The guest should learn which eVMCS versions the host supports by 1982 * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is 1983 * expected to set this CPUID leaf according to the value 1984 * returned in vmcs_version from nested_enable_evmcs(). 1985 * 1986 * However, it turns out that Microsoft Hyper-V fails to comply 1987 * with its own invented interface: when Hyper-V uses eVMCS, it 1988 * simply sets the first u32 field of the eVMCS to the revision_id 1989 * specified in MSR_IA32_VMX_BASIC, instead of to the eVMCS version 1990 * number, which is one of the supported versions specified in 1991 * CPUID.0x4000000A.EAX[0:15]. 1992 * 1993 * To work around this Hyper-V bug, accept either a supported 1994 * eVMCS version or the VMCS12 revision_id as valid values for the 1995 * first u32 field of the eVMCS. 1996 */ 1997 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && 1998 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { 1999 nested_release_evmcs(vcpu); 2000 return EVMPTRLD_VMFAIL; 2001 } 2002 2003 vmx->nested.hv_evmcs_vmptr = evmcs_gpa; 2004 2005 evmcs_gpa_changed = true; 2006 /* 2007 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully 2008 * reloaded from guest memory (read-only fields, fields not 2009 * present in struct hv_enlightened_vmcs, ...). Make sure there 2010 * are no leftovers. 2011 */ 2012 if (from_launch) { 2013 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2014 memset(vmcs12, 0, sizeof(*vmcs12)); 2015 vmcs12->hdr.revision_id = VMCS12_REVISION; 2016 } 2017 2018 } 2019 2020 /* 2021 * Clean fields data can't be used on VMLAUNCH or when we switch 2022 * between different L2 guests, as KVM keeps a single VMCS12 per L1.
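* Clearing every clean bit below forces the next copy_enlightened_to_vmcs12() to treat all field groups as dirty and re-read the entire eVMCS into the cached vmcs12.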
2023 */ 2024 if (from_launch || evmcs_gpa_changed) 2025 vmx->nested.hv_evmcs->hv_clean_fields &= 2026 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2027 2028 return EVMPTRLD_SUCCEEDED; 2029 } 2030 2031 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2032 { 2033 struct vcpu_vmx *vmx = to_vmx(vcpu); 2034 2035 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2036 copy_vmcs12_to_enlightened(vmx); 2037 else 2038 copy_vmcs12_to_shadow(vmx); 2039 2040 vmx->nested.need_vmcs12_to_shadow_sync = false; 2041 } 2042 2043 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2044 { 2045 struct vcpu_vmx *vmx = 2046 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2047 2048 vmx->nested.preemption_timer_expired = true; 2049 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2050 kvm_vcpu_kick(&vmx->vcpu); 2051 2052 return HRTIMER_NORESTART; 2053 } 2054 2055 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2056 { 2057 struct vcpu_vmx *vmx = to_vmx(vcpu); 2058 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2059 2060 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2061 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2062 2063 if (!vmx->nested.has_preemption_timer_deadline) { 2064 vmx->nested.preemption_timer_deadline = 2065 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2066 vmx->nested.has_preemption_timer_deadline = true; 2067 } 2068 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2069 } 2070 2071 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2072 u64 preemption_timeout) 2073 { 2074 struct vcpu_vmx *vmx = to_vmx(vcpu); 2075 2076 /* 2077 * A timer value of zero is architecturally guaranteed to cause 2078 * a VMExit prior to executing any instructions in the guest. 2079 */ 2080 if (preemption_timeout == 0) { 2081 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2082 return; 2083 } 2084 2085 if (vcpu->arch.virtual_tsc_khz == 0) 2086 return; 2087 2088 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2089 preemption_timeout *= 1000000; 2090 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2091 hrtimer_start(&vmx->nested.preemption_timer, 2092 ktime_add_ns(ktime_get(), preemption_timeout), 2093 HRTIMER_MODE_ABS_PINNED); 2094 } 2095 2096 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2097 { 2098 if (vmx->nested.nested_run_pending && 2099 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2100 return vmcs12->guest_ia32_efer; 2101 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2102 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2103 else 2104 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2105 } 2106 2107 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2108 { 2109 /* 2110 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2111 * according to L0's settings (vmcs12 is irrelevant here). Host 2112 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2113 * will be set as needed prior to VMLAUNCH/VMRESUME. 2114 */ 2115 if (vmx->nested.vmcs02_initialized) 2116 return; 2117 vmx->nested.vmcs02_initialized = true; 2118 2119 /* 2120 * We don't care what the EPTP value is we just need to guarantee 2121 * it's valid so we don't get a false positive when doing early 2122 * consistency checks. 2123 */ 2124 if (enable_ept && nested_early_check) 2125 vmcs_write64(EPT_POINTER, 2126 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2127 2128 /* All VMFUNCs are currently emulated through L0 vmexits. 
*/ 2129 if (cpu_has_vmx_vmfunc()) 2130 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2131 2132 if (cpu_has_vmx_posted_intr()) 2133 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2134 2135 if (cpu_has_vmx_msr_bitmap()) 2136 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2137 2138 /* 2139 * PML is emulated for L2, but never enabled in hardware as the MMU 2140 * handles A/D emulation. Disabling PML for L2 also avoids having to 2141 * deal with filtering out L2 GPAs from the buffer. 2142 */ 2143 if (enable_pml) { 2144 vmcs_write64(PML_ADDRESS, 0); 2145 vmcs_write16(GUEST_PML_INDEX, -1); 2146 } 2147 2148 if (cpu_has_vmx_encls_vmexit()) 2149 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2150 2151 /* 2152 * Set the MSR load/store lists to match L0's settings. Only the 2153 * addresses are constant (for vmcs02), the counts can change based 2154 * on L2's behavior, e.g. switching to/from long mode. 2155 */ 2156 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2157 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2158 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2159 2160 vmx_set_constant_host_state(vmx); 2161 } 2162 2163 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2164 struct vmcs12 *vmcs12) 2165 { 2166 prepare_vmcs02_constant_state(vmx); 2167 2168 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2169 2170 if (enable_vpid) { 2171 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2172 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2173 else 2174 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2175 } 2176 } 2177 2178 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2179 struct vmcs12 *vmcs12) 2180 { 2181 u32 exec_control; 2182 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2183 2184 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2185 prepare_vmcs02_early_rare(vmx, vmcs12); 2186 2187 /* 2188 * PIN CONTROLS 2189 */ 2190 exec_control = __pin_controls_get(vmcs01); 2191 exec_control |= (vmcs12->pin_based_vm_exec_control & 2192 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2193 2194 /* Posted interrupts setting is only taken from vmcs12. */ 2195 vmx->nested.pi_pending = false; 2196 if (nested_cpu_has_posted_intr(vmcs12)) 2197 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2198 else 2199 exec_control &= ~PIN_BASED_POSTED_INTR; 2200 pin_controls_set(vmx, exec_control); 2201 2202 /* 2203 * EXEC CONTROLS 2204 */ 2205 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2206 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2207 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2208 exec_control &= ~CPU_BASED_TPR_SHADOW; 2209 exec_control |= vmcs12->cpu_based_vm_exec_control; 2210 2211 vmx->nested.l1_tpr_threshold = -1; 2212 if (exec_control & CPU_BASED_TPR_SHADOW) 2213 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2214 #ifdef CONFIG_X86_64 2215 else 2216 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2217 CPU_BASED_CR8_STORE_EXITING; 2218 #endif 2219 2220 /* 2221 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2222 * for I/O port accesses. 2223 */ 2224 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2225 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2226 2227 /* 2228 * This bit will be computed in nested_get_vmcs12_pages, because 2229 * we do not have access to L1's MSR bitmap yet. For now, keep 2230 * the same bit as before, hoping to avoid multiple VMWRITEs that 2231 * only set/clear this bit. 
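* IOW, the two statements below simply carry over whatever USE_MSR_BITMAPS value vmcs02 currently has; nested_get_vmcs12_pages() later calls nested_vmx_prepare_msr_bitmap() and then sets or clears the bit definitively.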
2232 */ 2233 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2234 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2235 2236 exec_controls_set(vmx, exec_control); 2237 2238 /* 2239 * SECONDARY EXEC CONTROLS 2240 */ 2241 if (cpu_has_secondary_exec_ctrls()) { 2242 exec_control = __secondary_exec_controls_get(vmcs01); 2243 2244 /* Take the following fields only from vmcs12 */ 2245 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2246 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2247 SECONDARY_EXEC_ENABLE_INVPCID | 2248 SECONDARY_EXEC_ENABLE_RDTSCP | 2249 SECONDARY_EXEC_XSAVES | 2250 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2251 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2252 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2253 SECONDARY_EXEC_ENABLE_VMFUNC | 2254 SECONDARY_EXEC_TSC_SCALING | 2255 SECONDARY_EXEC_DESC); 2256 2257 if (nested_cpu_has(vmcs12, 2258 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2259 exec_control |= vmcs12->secondary_vm_exec_control; 2260 2261 /* PML is emulated and never enabled in hardware for L2. */ 2262 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2263 2264 /* VMCS shadowing for L2 is emulated for now */ 2265 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2266 2267 /* 2268 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2269 * will not have to rewrite the controls just for this bit. 2270 */ 2271 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2272 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2273 exec_control |= SECONDARY_EXEC_DESC; 2274 2275 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2276 vmcs_write16(GUEST_INTR_STATUS, 2277 vmcs12->guest_intr_status); 2278 2279 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2280 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2281 2282 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2283 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2284 2285 secondary_exec_controls_set(vmx, exec_control); 2286 } 2287 2288 /* 2289 * ENTRY CONTROLS 2290 * 2291 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2292 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2293 * on the related bits (if supported by the CPU) in the hope that 2294 * we can avoid VMWrites during vmx_set_efer(). 2295 */ 2296 exec_control = __vm_entry_controls_get(vmcs01); 2297 exec_control |= vmcs12->vm_entry_controls; 2298 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2299 if (cpu_has_load_ia32_efer()) { 2300 if (guest_efer & EFER_LMA) 2301 exec_control |= VM_ENTRY_IA32E_MODE; 2302 if (guest_efer != host_efer) 2303 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2304 } 2305 vm_entry_controls_set(vmx, exec_control); 2306 2307 /* 2308 * EXIT CONTROLS 2309 * 2310 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2311 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2312 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
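* IOW, vmcs12->vm_exit_controls is never merged into vmcs02; L1's exit policies are honored when KVM emulates the L2->L1 exit and loads the vmcs12 host state (see load_vmcs12_host_state()).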
2313 */ 2314 exec_control = __vm_exit_controls_get(vmcs01); 2315 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2316 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2317 else 2318 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2319 vm_exit_controls_set(vmx, exec_control); 2320 2321 /* 2322 * Interrupt/Exception Fields 2323 */ 2324 if (vmx->nested.nested_run_pending) { 2325 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2326 vmcs12->vm_entry_intr_info_field); 2327 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2328 vmcs12->vm_entry_exception_error_code); 2329 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2330 vmcs12->vm_entry_instruction_len); 2331 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2332 vmcs12->guest_interruptibility_info); 2333 vmx->loaded_vmcs->nmi_known_unmasked = 2334 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2335 } else { 2336 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2337 } 2338 } 2339 2340 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2341 { 2342 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2343 2344 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2345 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2346 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2347 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2348 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2349 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2350 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2351 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2352 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2353 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2354 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2355 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2356 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2357 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2358 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2359 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2360 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2361 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2362 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2363 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2364 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2365 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2366 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2367 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2368 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2369 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2370 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2371 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2372 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2373 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2374 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2375 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2376 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2377 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2378 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2379 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2380 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2381 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2382 2383 vmx->segment_cache.bitmask = 0; 2384 } 2385 2386 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2387 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 
2388 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2389 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2390 vmcs12->guest_pending_dbg_exceptions); 2391 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2392 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2393 2394 /* 2395 * L1 may access the L2's PDPTR, so save them to construct 2396 * vmcs12 2397 */ 2398 if (enable_ept) { 2399 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2400 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2401 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2402 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2403 } 2404 2405 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2406 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2407 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2408 } 2409 2410 if (nested_cpu_has_xsaves(vmcs12)) 2411 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2412 2413 /* 2414 * Whether page-faults are trapped is determined by a combination of 2415 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2416 * doesn't care about page faults then we should set all of these to 2417 * L1's desires. However, if L0 does care about (some) page faults, it 2418 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2419 * simply ask to exit on each and every L2 page fault. This is done by 2420 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2421 * Note that below we don't need special code to set EB.PF beyond the 2422 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2423 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2424 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2425 */ 2426 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2427 /* 2428 * TODO: if both L0 and L1 need the same MASK and MATCH, 2429 * go ahead and use it? 2430 */ 2431 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2432 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2433 } else { 2434 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2435 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2436 } 2437 2438 if (cpu_has_vmx_apicv()) { 2439 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2440 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2441 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2442 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2443 } 2444 2445 /* 2446 * Make sure the msr_autostore list is up to date before we set the 2447 * count in the vmcs02. 2448 */ 2449 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2450 2451 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2452 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2453 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2454 2455 set_cr4_guest_host_mask(vmx); 2456 } 2457 2458 /* 2459 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2460 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2461 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2462 * guest in a way that will both be appropriate to L1's requests, and our 2463 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2464 * function also has additional necessary side-effects, like setting various 2465 * vcpu->arch fields. 2466 * Returns 0 on success, 1 on failure. 
Invalid state exit qualification code 2467 * is assigned to entry_failure_code on failure. 2468 */ 2469 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2470 bool from_vmentry, 2471 enum vm_entry_failure_code *entry_failure_code) 2472 { 2473 struct vcpu_vmx *vmx = to_vmx(vcpu); 2474 bool load_guest_pdptrs_vmcs12 = false; 2475 2476 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 2477 prepare_vmcs02_rare(vmx, vmcs12); 2478 vmx->nested.dirty_vmcs12 = false; 2479 2480 load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) || 2481 !(vmx->nested.hv_evmcs->hv_clean_fields & 2482 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2483 } 2484 2485 if (vmx->nested.nested_run_pending && 2486 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2487 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2488 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2489 } else { 2490 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2491 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 2492 } 2493 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2494 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2495 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); 2496 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2497 2498 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2499 * bitwise-or of what L1 wants to trap for L2, and what we want to 2500 * trap. Note that CR0.TS also needs updating - we do this later. 2501 */ 2502 vmx_update_exception_bitmap(vcpu); 2503 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2504 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2505 2506 if (vmx->nested.nested_run_pending && 2507 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2508 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2509 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2510 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2511 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2512 } 2513 2514 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 2515 vcpu->arch.l1_tsc_offset, 2516 vmx_get_l2_tsc_offset(vcpu), 2517 vmx_get_l2_tsc_multiplier(vcpu)); 2518 2519 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( 2520 vcpu->arch.l1_tsc_scaling_ratio, 2521 vmx_get_l2_tsc_multiplier(vcpu)); 2522 2523 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2524 if (kvm_has_tsc_control) 2525 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2526 2527 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2528 2529 if (nested_cpu_has_ept(vmcs12)) 2530 nested_ept_init_mmu_context(vcpu); 2531 2532 /* 2533 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those 2534 * bits that we consider must be enabled. 2535 * The CR0_READ_SHADOW is what L2 should have expected to read given 2536 * the specifications by L1; it's not enough to take 2537 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may 2538 * have more bits set than L1 expected.
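* For CR0 bits that are set in the guest/host mask, an L2 read of CR0 returns the corresponding bit of CR0_READ_SHADOW; for clear bits it returns the live GUEST_CR0 bit. The shadow written below is therefore nested_read_cr0(vmcs12), which is roughly (guest_cr0 & ~cr0_guest_host_mask) | (cr0_read_shadow & cr0_guest_host_mask), i.e. the CR0 value L2 would observe under L1's own mask.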
2539 */ 2540 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2541 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2542 2543 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2544 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2545 2546 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2547 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2548 vmx_set_efer(vcpu, vcpu->arch.efer); 2549 2550 /* 2551 * If the guest state is invalid and unrestricted guest is disabled, 2552 * L1 attempted VMEntry to L2 with invalid state. 2553 * Fail the VMEntry. 2554 * 2555 * However, when force loading the guest state (on SMM exit or when 2556 * loading nested state after migration), it is possible to 2557 * have invalid guest state at this point; it will be fixed up later 2558 * by restoring the L2 register state. 2559 */ 2560 if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { 2561 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2562 return -EINVAL; 2563 } 2564 2565 /* Load the guest CR3, shadowed by either EPT or shadow page tables. */ 2566 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2567 from_vmentry, entry_failure_code)) 2568 return -EINVAL; 2569 2570 /* 2571 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 2572 * on nested VM-Exit, which can occur without actually running L2 and 2573 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with 2574 * vmcs12.GUEST_ACTIVITY_STATE=HLT, in which case KVM will intercept the 2575 * transition to HLT instead of running L2. 2576 */ 2577 if (enable_ept) 2578 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2579 2580 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2581 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2582 is_pae_paging(vcpu)) { 2583 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2584 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2585 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2586 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2587 } 2588 2589 if (!enable_ept) 2590 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2591 2592 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2593 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2594 vmcs12->guest_ia32_perf_global_ctrl))) { 2595 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2596 return -EINVAL; 2597 } 2598 2599 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2600 kvm_rip_write(vcpu, vmcs12->guest_rip); 2601 2602 /* 2603 * It was observed that genuine Hyper-V running in L1 doesn't reset 2604 * 'hv_clean_fields' by itself; it only sets the corresponding dirty 2605 * bits when it changes a field in eVMCS. Mark all fields as clean 2606 * here.
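* I.e. hv_clean_fields is set to HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL below, so that until L1 dirties a group again the next copy_enlightened_to_vmcs12() can skip copying it.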
2607 */ 2608 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2609 vmx->nested.hv_evmcs->hv_clean_fields |= 2610 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2611 2612 return 0; 2613 } 2614 2615 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2616 { 2617 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2618 nested_cpu_has_virtual_nmis(vmcs12))) 2619 return -EINVAL; 2620 2621 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2622 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2623 return -EINVAL; 2624 2625 return 0; 2626 } 2627 2628 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2629 { 2630 struct vcpu_vmx *vmx = to_vmx(vcpu); 2631 2632 /* Check for memory type validity */ 2633 switch (new_eptp & VMX_EPTP_MT_MASK) { 2634 case VMX_EPTP_MT_UC: 2635 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2636 return false; 2637 break; 2638 case VMX_EPTP_MT_WB: 2639 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2640 return false; 2641 break; 2642 default: 2643 return false; 2644 } 2645 2646 /* Page-walk levels validity. */ 2647 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2648 case VMX_EPTP_PWL_5: 2649 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2650 return false; 2651 break; 2652 case VMX_EPTP_PWL_4: 2653 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2654 return false; 2655 break; 2656 default: 2657 return false; 2658 } 2659 2660 /* Reserved bits should not be set */ 2661 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2662 return false; 2663 2664 /* AD, if set, should be supported */ 2665 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2666 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2667 return false; 2668 } 2669 2670 return true; 2671 } 2672 2673 /* 2674 * Checks related to VM-Execution Control Fields 2675 */ 2676 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2677 struct vmcs12 *vmcs12) 2678 { 2679 struct vcpu_vmx *vmx = to_vmx(vcpu); 2680 2681 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2682 vmx->nested.msrs.pinbased_ctls_low, 2683 vmx->nested.msrs.pinbased_ctls_high)) || 2684 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2685 vmx->nested.msrs.procbased_ctls_low, 2686 vmx->nested.msrs.procbased_ctls_high))) 2687 return -EINVAL; 2688 2689 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2690 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2691 vmx->nested.msrs.secondary_ctls_low, 2692 vmx->nested.msrs.secondary_ctls_high))) 2693 return -EINVAL; 2694 2695 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2696 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2697 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2698 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2699 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2700 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2701 nested_vmx_check_nmi_controls(vmcs12) || 2702 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2703 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2704 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2705 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2706 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2707 return -EINVAL; 2708 2709 if (!nested_cpu_has_preemption_timer(vmcs12) && 2710 nested_cpu_has_save_preemption_timer(vmcs12)) 2711 return -EINVAL; 2712 2713 if (nested_cpu_has_ept(vmcs12) && 2714 CC(!nested_vmx_check_eptp(vcpu, 
vmcs12->ept_pointer))) 2715 return -EINVAL; 2716 2717 if (nested_cpu_has_vmfunc(vmcs12)) { 2718 if (CC(vmcs12->vm_function_control & 2719 ~vmx->nested.msrs.vmfunc_controls)) 2720 return -EINVAL; 2721 2722 if (nested_cpu_has_eptp_switching(vmcs12)) { 2723 if (CC(!nested_cpu_has_ept(vmcs12)) || 2724 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2725 return -EINVAL; 2726 } 2727 } 2728 2729 return 0; 2730 } 2731 2732 /* 2733 * Checks related to VM-Exit Control Fields 2734 */ 2735 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2736 struct vmcs12 *vmcs12) 2737 { 2738 struct vcpu_vmx *vmx = to_vmx(vcpu); 2739 2740 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2741 vmx->nested.msrs.exit_ctls_low, 2742 vmx->nested.msrs.exit_ctls_high)) || 2743 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2744 return -EINVAL; 2745 2746 return 0; 2747 } 2748 2749 /* 2750 * Checks related to VM-Entry Control Fields 2751 */ 2752 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2753 struct vmcs12 *vmcs12) 2754 { 2755 struct vcpu_vmx *vmx = to_vmx(vcpu); 2756 2757 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2758 vmx->nested.msrs.entry_ctls_low, 2759 vmx->nested.msrs.entry_ctls_high))) 2760 return -EINVAL; 2761 2762 /* 2763 * From the Intel SDM, volume 3: 2764 * Fields relevant to VM-entry event injection must be set properly. 2765 * These fields are the VM-entry interruption-information field, the 2766 * VM-entry exception error code, and the VM-entry instruction length. 2767 */ 2768 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2769 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2770 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2771 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2772 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2773 bool should_have_error_code; 2774 bool urg = nested_cpu_has2(vmcs12, 2775 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2776 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2777 2778 /* VM-entry interruption-info field: interruption type */ 2779 if (CC(intr_type == INTR_TYPE_RESERVED) || 2780 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2781 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2782 return -EINVAL; 2783 2784 /* VM-entry interruption-info field: vector */ 2785 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2786 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2787 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2788 return -EINVAL; 2789 2790 /* VM-entry interruption-info field: deliver error code */ 2791 should_have_error_code = 2792 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2793 x86_exception_has_error_code(vector); 2794 if (CC(has_error_code != should_have_error_code)) 2795 return -EINVAL; 2796 2797 /* VM-entry exception error code */ 2798 if (CC(has_error_code && 2799 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2800 return -EINVAL; 2801 2802 /* VM-entry interruption-info field: reserved bits */ 2803 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2804 return -EINVAL; 2805 2806 /* VM-entry instruction length */ 2807 switch (intr_type) { 2808 case INTR_TYPE_SOFT_EXCEPTION: 2809 case INTR_TYPE_SOFT_INTR: 2810 case INTR_TYPE_PRIV_SW_EXCEPTION: 2811 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2812 CC(vmcs12->vm_entry_instruction_len == 0 && 2813 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2814 return -EINVAL; 2815 } 2816 } 2817 2818 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 
2819 return -EINVAL; 2820 2821 return 0; 2822 } 2823 2824 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2825 struct vmcs12 *vmcs12) 2826 { 2827 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2828 nested_check_vm_exit_controls(vcpu, vmcs12) || 2829 nested_check_vm_entry_controls(vcpu, vmcs12)) 2830 return -EINVAL; 2831 2832 if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled) 2833 return nested_evmcs_check_controls(vmcs12); 2834 2835 return 0; 2836 } 2837 2838 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 2839 struct vmcs12 *vmcs12) 2840 { 2841 #ifdef CONFIG_X86_64 2842 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 2843 !!(vcpu->arch.efer & EFER_LMA))) 2844 return -EINVAL; 2845 #endif 2846 return 0; 2847 } 2848 2849 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2850 struct vmcs12 *vmcs12) 2851 { 2852 bool ia32e; 2853 2854 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2855 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2856 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) 2857 return -EINVAL; 2858 2859 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2860 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2861 return -EINVAL; 2862 2863 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2864 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2865 return -EINVAL; 2866 2867 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2868 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2869 vmcs12->host_ia32_perf_global_ctrl))) 2870 return -EINVAL; 2871 2872 #ifdef CONFIG_X86_64 2873 ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 2874 #else 2875 ia32e = false; 2876 #endif 2877 2878 if (ia32e) { 2879 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2880 return -EINVAL; 2881 } else { 2882 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2883 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2884 CC((vmcs12->host_rip) >> 32)) 2885 return -EINVAL; 2886 } 2887 2888 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2889 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2890 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2891 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2892 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2893 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2894 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2895 CC(vmcs12->host_cs_selector == 0) || 2896 CC(vmcs12->host_tr_selector == 0) || 2897 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2898 return -EINVAL; 2899 2900 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2901 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2902 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2903 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2904 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2905 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2906 return -EINVAL; 2907 2908 /* 2909 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2910 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2911 * the values of the LMA and LME bits in the field must each be that of 2912 * the host address-space size VM-exit control. 
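* E.g. if VM_EXIT_HOST_ADDR_SPACE_SIZE is set (ia32e == true), both EFER.LMA and EFER.LME in host_ia32_efer must be 1; if it is clear, both must be 0, which is exactly what the CC() checks below enforce.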
2913 */ 2914 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2915 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2916 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2917 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2918 return -EINVAL; 2919 } 2920 2921 return 0; 2922 } 2923 2924 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2925 struct vmcs12 *vmcs12) 2926 { 2927 struct vcpu_vmx *vmx = to_vmx(vcpu); 2928 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 2929 struct vmcs_hdr hdr; 2930 2931 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 2932 return 0; 2933 2934 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2935 return -EINVAL; 2936 2937 if (ghc->gpa != vmcs12->vmcs_link_pointer && 2938 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 2939 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 2940 return -EINVAL; 2941 2942 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 2943 offsetof(struct vmcs12, hdr), 2944 sizeof(hdr)))) 2945 return -EINVAL; 2946 2947 if (CC(hdr.revision_id != VMCS12_REVISION) || 2948 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2949 return -EINVAL; 2950 2951 return 0; 2952 } 2953 2954 /* 2955 * Checks related to Guest Non-register State 2956 */ 2957 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2958 { 2959 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2960 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 2961 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 2962 return -EINVAL; 2963 2964 return 0; 2965 } 2966 2967 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 2968 struct vmcs12 *vmcs12, 2969 enum vm_entry_failure_code *entry_failure_code) 2970 { 2971 bool ia32e; 2972 2973 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2974 2975 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 2976 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 2977 return -EINVAL; 2978 2979 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 2980 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 2981 return -EINVAL; 2982 2983 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 2984 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 2985 return -EINVAL; 2986 2987 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 2988 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 2989 return -EINVAL; 2990 } 2991 2992 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2993 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2994 vmcs12->guest_ia32_perf_global_ctrl))) 2995 return -EINVAL; 2996 2997 /* 2998 * If the load IA32_EFER VM-entry control is 1, the following checks 2999 * are performed on the field for the IA32_EFER MSR: 3000 * - Bits reserved in the IA32_EFER MSR must be 0. 3001 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3002 * the IA-32e mode guest VM-entry control. It must also be identical 3003 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3004 * CR0.PG) is 1.
3005 */ 3006 if (to_vmx(vcpu)->nested.nested_run_pending && 3007 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3008 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 3009 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3010 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3011 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3012 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3013 return -EINVAL; 3014 } 3015 3016 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3017 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3018 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3019 return -EINVAL; 3020 3021 if (nested_check_guest_non_reg_state(vmcs12)) 3022 return -EINVAL; 3023 3024 return 0; 3025 } 3026 3027 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3028 { 3029 struct vcpu_vmx *vmx = to_vmx(vcpu); 3030 unsigned long cr3, cr4; 3031 bool vm_fail; 3032 3033 if (!nested_early_check) 3034 return 0; 3035 3036 if (vmx->msr_autoload.host.nr) 3037 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3038 if (vmx->msr_autoload.guest.nr) 3039 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3040 3041 preempt_disable(); 3042 3043 vmx_prepare_switch_to_guest(vcpu); 3044 3045 /* 3046 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3047 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3048 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3049 * there is no need to preserve other bits or save/restore the field. 3050 */ 3051 vmcs_writel(GUEST_RFLAGS, 0); 3052 3053 cr3 = __get_current_cr3_fast(); 3054 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3055 vmcs_writel(HOST_CR3, cr3); 3056 vmx->loaded_vmcs->host_state.cr3 = cr3; 3057 } 3058 3059 cr4 = cr4_read_shadow(); 3060 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3061 vmcs_writel(HOST_CR4, cr4); 3062 vmx->loaded_vmcs->host_state.cr4 = cr4; 3063 } 3064 3065 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3066 vmx->loaded_vmcs->launched); 3067 3068 if (vmx->msr_autoload.host.nr) 3069 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3070 if (vmx->msr_autoload.guest.nr) 3071 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3072 3073 if (vm_fail) { 3074 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3075 3076 preempt_enable(); 3077 3078 trace_kvm_nested_vmenter_failed( 3079 "early hardware check VM-instruction error: ", error); 3080 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3081 return 1; 3082 } 3083 3084 /* 3085 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3086 */ 3087 if (hw_breakpoint_active()) 3088 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3089 local_irq_enable(); 3090 preempt_enable(); 3091 3092 /* 3093 * A non-failing VMEntry means we somehow entered guest mode with 3094 * an illegal RIP, and that's just the tip of the iceberg. There 3095 * is no telling what memory has been modified or what state has 3096 * been exposed to unknown code. Hitting this all but guarantees 3097 * a (very critical) hardware issue. 3098 */ 3099 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3100 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3101 3102 return 0; 3103 } 3104 3105 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3106 { 3107 struct vcpu_vmx *vmx = to_vmx(vcpu); 3108 3109 /* 3110 * hv_evmcs may end up being not mapped after migration (when 3111 * L2 was running), map it here to make sure vmcs12 changes are 3112 * properly reflected. 
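* The vmptrld emulation below runs with from_launch == false; it re-maps the eVMCS and sets need_vmcs12_to_shadow_sync so that the vmcs12 restored by userspace is copied back into the eVMCS before the next entry to L2.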
3113 */ 3114 if (vmx->nested.enlightened_vmcs_enabled && 3115 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3116 enum nested_evmptrld_status evmptrld_status = 3117 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3118 3119 if (evmptrld_status == EVMPTRLD_VMFAIL || 3120 evmptrld_status == EVMPTRLD_ERROR) 3121 return false; 3122 3123 /* 3124 * Post migration VMCS12 always provides the most actual 3125 * information, copy it to eVMCS upon entry. 3126 */ 3127 vmx->nested.need_vmcs12_to_shadow_sync = true; 3128 } 3129 3130 return true; 3131 } 3132 3133 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3134 { 3135 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3136 struct vcpu_vmx *vmx = to_vmx(vcpu); 3137 struct kvm_host_map *map; 3138 struct page *page; 3139 u64 hpa; 3140 3141 if (!vcpu->arch.pdptrs_from_userspace && 3142 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3143 /* 3144 * Reload the guest's PDPTRs since after a migration 3145 * the guest CR3 might be restored prior to setting the nested 3146 * state which can lead to a load of wrong PDPTRs. 3147 */ 3148 if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) 3149 return false; 3150 } 3151 3152 3153 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3154 /* 3155 * Translate L1 physical address to host physical 3156 * address for vmcs02. Keep the page pinned, so this 3157 * physical address remains valid. We keep a reference 3158 * to it so we can release it later. 3159 */ 3160 if (vmx->nested.apic_access_page) { /* shouldn't happen */ 3161 kvm_release_page_clean(vmx->nested.apic_access_page); 3162 vmx->nested.apic_access_page = NULL; 3163 } 3164 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); 3165 if (!is_error_page(page)) { 3166 vmx->nested.apic_access_page = page; 3167 hpa = page_to_phys(vmx->nested.apic_access_page); 3168 vmcs_write64(APIC_ACCESS_ADDR, hpa); 3169 } else { 3170 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", 3171 __func__); 3172 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3173 vcpu->run->internal.suberror = 3174 KVM_INTERNAL_ERROR_EMULATION; 3175 vcpu->run->internal.ndata = 0; 3176 return false; 3177 } 3178 } 3179 3180 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3181 map = &vmx->nested.virtual_apic_map; 3182 3183 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3184 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3185 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3186 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3187 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3188 /* 3189 * The processor will never use the TPR shadow, simply 3190 * clear the bit from the execution control. Such a 3191 * configuration is useless, but it happens in tests. 3192 * For any other configuration, failing the vm entry is 3193 * _not_ what the processor does but it's basically the 3194 * only possibility we have. 3195 */ 3196 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3197 } else { 3198 /* 3199 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3200 * force VM-Entry to fail. 
3201 */ 3202 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3203 } 3204 } 3205 3206 if (nested_cpu_has_posted_intr(vmcs12)) { 3207 map = &vmx->nested.pi_desc_map; 3208 3209 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3210 vmx->nested.pi_desc = 3211 (struct pi_desc *)(((void *)map->hva) + 3212 offset_in_page(vmcs12->posted_intr_desc_addr)); 3213 vmcs_write64(POSTED_INTR_DESC_ADDR, 3214 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3215 } else { 3216 /* 3217 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3218 * access the contents of the VMCS12 posted interrupt 3219 * descriptor. (Note that KVM may do this when it 3220 * should not, per the architectural specification.) 3221 */ 3222 vmx->nested.pi_desc = NULL; 3223 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3224 } 3225 } 3226 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3227 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3228 else 3229 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3230 3231 return true; 3232 } 3233 3234 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3235 { 3236 if (!nested_get_evmcs_page(vcpu)) { 3237 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3238 __func__); 3239 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3240 vcpu->run->internal.suberror = 3241 KVM_INTERNAL_ERROR_EMULATION; 3242 vcpu->run->internal.ndata = 0; 3243 3244 return false; 3245 } 3246 3247 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3248 return false; 3249 3250 return true; 3251 } 3252 3253 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3254 { 3255 struct vmcs12 *vmcs12; 3256 struct vcpu_vmx *vmx = to_vmx(vcpu); 3257 gpa_t dst; 3258 3259 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3260 return 0; 3261 3262 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3263 return 1; 3264 3265 /* 3266 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3267 * set is already checked as part of A/D emulation. 3268 */ 3269 vmcs12 = get_vmcs12(vcpu); 3270 if (!nested_cpu_has_pml(vmcs12)) 3271 return 0; 3272 3273 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3274 vmx->nested.pml_full = true; 3275 return 1; 3276 } 3277 3278 gpa &= ~0xFFFull; 3279 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3280 3281 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3282 offset_in_page(dst), sizeof(gpa))) 3283 return 0; 3284 3285 vmcs12->guest_pml_index--; 3286 3287 return 0; 3288 } 3289 3290 /* 3291 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3292 * for running VMX instructions (except VMXON, whose prerequisites are 3293 * slightly different). It also specifies what exception to inject otherwise. 3294 * Note that many of these exceptions have priority over VM exits, so they 3295 * don't have to be checked again here. 
3296 */ 3297 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3298 { 3299 if (!to_vmx(vcpu)->nested.vmxon) { 3300 kvm_queue_exception(vcpu, UD_VECTOR); 3301 return 0; 3302 } 3303 3304 if (vmx_get_cpl(vcpu)) { 3305 kvm_inject_gp(vcpu, 0); 3306 return 0; 3307 } 3308 3309 return 1; 3310 } 3311 3312 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3313 { 3314 u8 rvi = vmx_get_rvi(); 3315 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3316 3317 return ((rvi & 0xf0) > (vppr & 0xf0)); 3318 } 3319 3320 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3321 struct vmcs12 *vmcs12); 3322 3323 /* 3324 * If from_vmentry is false, this is being called from state restore (either RSM 3325 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 3326 * 3327 * Returns: 3328 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3329 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3330 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3331 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3332 */ 3333 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3334 bool from_vmentry) 3335 { 3336 struct vcpu_vmx *vmx = to_vmx(vcpu); 3337 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3338 enum vm_entry_failure_code entry_failure_code; 3339 bool evaluate_pending_interrupts; 3340 union vmx_exit_reason exit_reason = { 3341 .basic = EXIT_REASON_INVALID_STATE, 3342 .failed_vmentry = 1, 3343 }; 3344 u32 failed_index; 3345 3346 kvm_service_local_tlb_flush_requests(vcpu); 3347 3348 evaluate_pending_interrupts = exec_controls_get(vmx) & 3349 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3350 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3351 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3352 3353 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3354 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3355 if (kvm_mpx_supported() && 3356 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 3357 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3358 3359 /* 3360 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3361 * nested early checks are disabled. In the event of a "late" VM-Fail, 3362 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3363 * software model to the pre-VMEntry host state. When EPT is disabled, 3364 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3365 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3366 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3367 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3368 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3369 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3370 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3371 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3372 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3373 * path would need to manually save/restore vmcs01.GUEST_CR3. 
3374 */ 3375 if (!enable_ept && !nested_early_check) 3376 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3377 3378 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3379 3380 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3381 3382 if (from_vmentry) { 3383 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3384 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3385 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3386 } 3387 3388 if (nested_vmx_check_vmentry_hw(vcpu)) { 3389 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3390 return NVMX_VMENTRY_VMFAIL; 3391 } 3392 3393 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3394 &entry_failure_code)) { 3395 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3396 vmcs12->exit_qualification = entry_failure_code; 3397 goto vmentry_fail_vmexit; 3398 } 3399 } 3400 3401 enter_guest_mode(vcpu); 3402 3403 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3404 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3405 vmcs12->exit_qualification = entry_failure_code; 3406 goto vmentry_fail_vmexit_guest_mode; 3407 } 3408 3409 if (from_vmentry) { 3410 failed_index = nested_vmx_load_msr(vcpu, 3411 vmcs12->vm_entry_msr_load_addr, 3412 vmcs12->vm_entry_msr_load_count); 3413 if (failed_index) { 3414 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3415 vmcs12->exit_qualification = failed_index; 3416 goto vmentry_fail_vmexit_guest_mode; 3417 } 3418 } else { 3419 /* 3420 * The MMU is not initialized to point at the right entities yet and 3421 * "get pages" would need to read data from the guest (i.e. we will 3422 * need to perform gpa to hpa translation). Request a call 3423 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3424 * have already been set at vmentry time and should not be reset. 3425 */ 3426 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3427 } 3428 3429 /* 3430 * If L1 had a pending IRQ/NMI until it executed 3431 * VMLAUNCH/VMRESUME which wasn't delivered because it was 3432 * disallowed (e.g. interrupts disabled), L0 needs to 3433 * evaluate if this pending event should cause an exit from L2 3434 * to L1 or delivered directly to L2 (e.g. In case L1 don't 3435 * intercept EXTERNAL_INTERRUPT). 3436 * 3437 * Usually this would be handled by the processor noticing an 3438 * IRQ/NMI window request, or checking RVI during evaluation of 3439 * pending virtual interrupts. However, this setting was done 3440 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 3441 * to perform pending event evaluation by requesting a KVM_REQ_EVENT. 3442 */ 3443 if (unlikely(evaluate_pending_interrupts)) 3444 kvm_make_request(KVM_REQ_EVENT, vcpu); 3445 3446 /* 3447 * Do not start the preemption timer hrtimer until after we know 3448 * we are successful, so that only nested_vmx_vmexit needs to cancel 3449 * the timer. 3450 */ 3451 vmx->nested.preemption_timer_expired = false; 3452 if (nested_cpu_has_preemption_timer(vmcs12)) { 3453 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3454 vmx_start_preemption_timer(vcpu, timer_value); 3455 } 3456 3457 /* 3458 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3459 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3460 * returned as far as L1 is concerned. It will only return (and set 3461 * the success flag) when L2 exits (see nested_vmx_vmexit()). 
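 *
 * For context, nested_vmx_run() maps the status returned here roughly as
 * follows (condensed sketch of its vmentry_failed handling):
 *
 *	NVMX_VMENTRY_KVM_INTERNAL_ERROR: return 0 (exit to userspace)
 *	NVMX_VMENTRY_VMEXIT:             return 1 (the VM-exit to L1 has
 *	                                 already been emulated)
 *	NVMX_VMENTRY_VMFAIL:             nested_vmx_fail(vcpu,
 *	                                 VMXERR_ENTRY_INVALID_CONTROL_FIELD)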
3462 */ 3463 return NVMX_VMENTRY_SUCCESS; 3464 3465 /* 3466 * A failed consistency check that leads to a VMExit during L1's 3467 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3468 * 26.7 "VM-entry failures during or after loading guest state". 3469 */ 3470 vmentry_fail_vmexit_guest_mode: 3471 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3472 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3473 leave_guest_mode(vcpu); 3474 3475 vmentry_fail_vmexit: 3476 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3477 3478 if (!from_vmentry) 3479 return NVMX_VMENTRY_VMEXIT; 3480 3481 load_vmcs12_host_state(vcpu, vmcs12); 3482 vmcs12->vm_exit_reason = exit_reason.full; 3483 if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 3484 vmx->nested.need_vmcs12_to_shadow_sync = true; 3485 return NVMX_VMENTRY_VMEXIT; 3486 } 3487 3488 /* 3489 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3490 * for running an L2 nested guest. 3491 */ 3492 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3493 { 3494 struct vmcs12 *vmcs12; 3495 enum nvmx_vmentry_status status; 3496 struct vcpu_vmx *vmx = to_vmx(vcpu); 3497 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3498 enum nested_evmptrld_status evmptrld_status; 3499 3500 if (!nested_vmx_check_permission(vcpu)) 3501 return 1; 3502 3503 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3504 if (evmptrld_status == EVMPTRLD_ERROR) { 3505 kvm_queue_exception(vcpu, UD_VECTOR); 3506 return 1; 3507 } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) { 3508 return nested_vmx_failInvalid(vcpu); 3509 } 3510 3511 if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && 3512 vmx->nested.current_vmptr == INVALID_GPA)) 3513 return nested_vmx_failInvalid(vcpu); 3514 3515 vmcs12 = get_vmcs12(vcpu); 3516 3517 /* 3518 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3519 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3520 * rather than RFLAGS.ZF, and no error number is stored to the 3521 * VM-instruction error field. 3522 */ 3523 if (CC(vmcs12->hdr.shadow_vmcs)) 3524 return nested_vmx_failInvalid(vcpu); 3525 3526 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 3527 copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); 3528 /* Enlightened VMCS doesn't have launch state */ 3529 vmcs12->launch_state = !launch; 3530 } else if (enable_shadow_vmcs) { 3531 copy_shadow_to_vmcs12(vmx); 3532 } 3533 3534 /* 3535 * The nested entry process starts with enforcing various prerequisites 3536 * on vmcs12 as required by the Intel SDM, and act appropriately when 3537 * they fail: As the SDM explains, some conditions should cause the 3538 * instruction to fail, while others will cause the instruction to seem 3539 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3540 * To speed up the normal (success) code path, we should avoid checking 3541 * for misconfigurations which will anyway be caught by the processor 3542 * when using the merged vmcs02. 3543 */ 3544 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3545 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3546 3547 if (CC(vmcs12->launch_state == launch)) 3548 return nested_vmx_fail(vcpu, 3549 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3550 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3551 3552 if (nested_vmx_check_controls(vcpu, vmcs12)) 3553 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3554 3555 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3556 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3557 3558 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3559 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3560 3561 /* 3562 * We're finally done with prerequisite checking, and can start with 3563 * the nested entry. 3564 */ 3565 vmx->nested.nested_run_pending = 1; 3566 vmx->nested.has_preemption_timer_deadline = false; 3567 status = nested_vmx_enter_non_root_mode(vcpu, true); 3568 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3569 goto vmentry_failed; 3570 3571 /* Emulate processing of posted interrupts on VM-Enter. */ 3572 if (nested_cpu_has_posted_intr(vmcs12) && 3573 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3574 vmx->nested.pi_pending = true; 3575 kvm_make_request(KVM_REQ_EVENT, vcpu); 3576 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3577 } 3578 3579 /* Hide L1D cache contents from the nested guest. */ 3580 vmx->vcpu.arch.l1tf_flush_l1d = true; 3581 3582 /* 3583 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3584 * also be used as part of restoring nVMX state for 3585 * snapshot restore (migration). 3586 * 3587 * In this flow, it is assumed that vmcs12 cache was 3588 * transferred as part of captured nVMX state and should 3589 * therefore not be read from guest memory (which may not 3590 * exist on destination host yet). 3591 */ 3592 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3593 3594 switch (vmcs12->guest_activity_state) { 3595 case GUEST_ACTIVITY_HLT: 3596 /* 3597 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3598 * awakened by event injection or by an NMI-window VM-exit or 3599 * by an interrupt-window VM-exit, halt the vcpu. 3600 */ 3601 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3602 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3603 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3604 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3605 vmx->nested.nested_run_pending = 0; 3606 return kvm_vcpu_halt(vcpu); 3607 } 3608 break; 3609 case GUEST_ACTIVITY_WAIT_SIPI: 3610 vmx->nested.nested_run_pending = 0; 3611 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3612 break; 3613 default: 3614 break; 3615 } 3616 3617 return 1; 3618 3619 vmentry_failed: 3620 vmx->nested.nested_run_pending = 0; 3621 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3622 return 0; 3623 if (status == NVMX_VMENTRY_VMEXIT) 3624 return 1; 3625 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3626 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3627 } 3628 3629 /* 3630 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3631 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3632 * This function returns the new value we should put in vmcs12.guest_cr0. 3633 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3634 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3635 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3636 * didn't trap the bit, because if L1 did, so would L0). 3637 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3638 * been modified by L2, and L1 knows it. 
So just leave the old value of 3639 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3640 * isn't relevant, because if L0 traps this bit it can set it to anything. 3641 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3642 * changed these bits, and therefore they need to be updated, but L0 3643 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3644 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3645 */ 3646 static inline unsigned long 3647 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3648 { 3649 return 3650 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3651 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3652 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3653 vcpu->arch.cr0_guest_owned_bits)); 3654 } 3655 3656 static inline unsigned long 3657 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3658 { 3659 return 3660 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3661 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3662 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3663 vcpu->arch.cr4_guest_owned_bits)); 3664 } 3665 3666 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3667 struct vmcs12 *vmcs12) 3668 { 3669 u32 idt_vectoring; 3670 unsigned int nr; 3671 3672 if (vcpu->arch.exception.injected) { 3673 nr = vcpu->arch.exception.nr; 3674 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3675 3676 if (kvm_exception_is_soft(nr)) { 3677 vmcs12->vm_exit_instruction_len = 3678 vcpu->arch.event_exit_inst_len; 3679 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3680 } else 3681 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3682 3683 if (vcpu->arch.exception.has_error_code) { 3684 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3685 vmcs12->idt_vectoring_error_code = 3686 vcpu->arch.exception.error_code; 3687 } 3688 3689 vmcs12->idt_vectoring_info_field = idt_vectoring; 3690 } else if (vcpu->arch.nmi_injected) { 3691 vmcs12->idt_vectoring_info_field = 3692 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3693 } else if (vcpu->arch.interrupt.injected) { 3694 nr = vcpu->arch.interrupt.nr; 3695 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3696 3697 if (vcpu->arch.interrupt.soft) { 3698 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3699 vmcs12->vm_entry_instruction_len = 3700 vcpu->arch.event_exit_inst_len; 3701 } else 3702 idt_vectoring |= INTR_TYPE_EXT_INTR; 3703 3704 vmcs12->idt_vectoring_info_field = idt_vectoring; 3705 } 3706 } 3707 3708 3709 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3710 { 3711 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3712 gfn_t gfn; 3713 3714 /* 3715 * Don't need to mark the APIC access page dirty; it is never 3716 * written to by the CPU during APIC virtualization. 
3717 */ 3718 3719 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3720 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3721 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3722 } 3723 3724 if (nested_cpu_has_posted_intr(vmcs12)) { 3725 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3726 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3727 } 3728 } 3729 3730 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3731 { 3732 struct vcpu_vmx *vmx = to_vmx(vcpu); 3733 int max_irr; 3734 void *vapic_page; 3735 u16 status; 3736 3737 if (!vmx->nested.pi_pending) 3738 return 0; 3739 3740 if (!vmx->nested.pi_desc) 3741 goto mmio_needed; 3742 3743 vmx->nested.pi_pending = false; 3744 3745 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3746 return 0; 3747 3748 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3749 if (max_irr != 256) { 3750 vapic_page = vmx->nested.virtual_apic_map.hva; 3751 if (!vapic_page) 3752 goto mmio_needed; 3753 3754 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3755 vapic_page, &max_irr); 3756 status = vmcs_read16(GUEST_INTR_STATUS); 3757 if ((u8)max_irr > ((u8)status & 0xff)) { 3758 status &= ~0xff; 3759 status |= (u8)max_irr; 3760 vmcs_write16(GUEST_INTR_STATUS, status); 3761 } 3762 } 3763 3764 nested_mark_vmcs12_pages_dirty(vcpu); 3765 return 0; 3766 3767 mmio_needed: 3768 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3769 return -ENXIO; 3770 } 3771 3772 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3773 unsigned long exit_qual) 3774 { 3775 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3776 unsigned int nr = vcpu->arch.exception.nr; 3777 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3778 3779 if (vcpu->arch.exception.has_error_code) { 3780 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3781 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3782 } 3783 3784 if (kvm_exception_is_soft(nr)) 3785 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3786 else 3787 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3788 3789 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3790 vmx_get_nmi_mask(vcpu)) 3791 intr_info |= INTR_INFO_UNBLOCK_NMI; 3792 3793 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3794 } 3795 3796 /* 3797 * Returns true if a debug trap is pending delivery. 3798 * 3799 * In KVM, debug traps bear an exception payload. As such, the class of a #DB 3800 * exception may be inferred from the presence of an exception payload. 3801 */ 3802 static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) 3803 { 3804 return vcpu->arch.exception.pending && 3805 vcpu->arch.exception.nr == DB_VECTOR && 3806 vcpu->arch.exception.payload; 3807 } 3808 3809 /* 3810 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 3811 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 3812 * represents these debug traps with a payload that is said to be compatible 3813 * with the 'pending debug exceptions' field, write the payload to the VMCS 3814 * field if a VM-exit is delivered before the debug trap. 
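 *
 * Illustration, assuming the payload uses the 'pending debug exceptions'
 * layout as the comment above implies (e.g. the single-step BS bit set
 * for a single-step trap): a higher-priority VM-exit simply propagates
 * the queued trap via
 *
 *	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vcpu->arch.exception.payload);
 *
 * so that L1 observes the not-yet-delivered trap in the expected field.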
3815 */ 3816 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 3817 { 3818 if (vmx_pending_dbg_trap(vcpu)) 3819 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 3820 vcpu->arch.exception.payload); 3821 } 3822 3823 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 3824 { 3825 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3826 to_vmx(vcpu)->nested.preemption_timer_expired; 3827 } 3828 3829 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 3830 { 3831 struct vcpu_vmx *vmx = to_vmx(vcpu); 3832 unsigned long exit_qual; 3833 bool block_nested_events = 3834 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3835 bool mtf_pending = vmx->nested.mtf_pending; 3836 struct kvm_lapic *apic = vcpu->arch.apic; 3837 3838 /* 3839 * Clear the MTF state. If a higher priority VM-exit is delivered first, 3840 * this state is discarded. 3841 */ 3842 if (!block_nested_events) 3843 vmx->nested.mtf_pending = false; 3844 3845 if (lapic_in_kernel(vcpu) && 3846 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3847 if (block_nested_events) 3848 return -EBUSY; 3849 nested_vmx_update_pending_dbg(vcpu); 3850 clear_bit(KVM_APIC_INIT, &apic->pending_events); 3851 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 3852 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3853 return 0; 3854 } 3855 3856 if (lapic_in_kernel(vcpu) && 3857 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 3858 if (block_nested_events) 3859 return -EBUSY; 3860 3861 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 3862 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 3863 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 3864 apic->sipi_vector & 0xFFUL); 3865 return 0; 3866 } 3867 3868 /* 3869 * Process any exceptions that are not debug traps before MTF. 3870 * 3871 * Note that only a pending nested run can block a pending exception. 3872 * Otherwise an injected NMI/interrupt should either be 3873 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO, 3874 * while delivering the pending exception. 
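 *
 * For orientation, the overall order checked in this function is
 * roughly (most steps bail out with -EBUSY while nested events are
 * blocked):
 *
 *	INIT -> SIPI -> non-debug-trap exceptions -> MTF -> remaining
 *	exceptions -> preemption timer -> SMI -> NMI -> external
 *	interrupt -> vmx_complete_nested_posted_interrupt()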
3875 */ 3876 3877 if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { 3878 if (vmx->nested.nested_run_pending) 3879 return -EBUSY; 3880 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3881 goto no_vmexit; 3882 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3883 return 0; 3884 } 3885 3886 if (mtf_pending) { 3887 if (block_nested_events) 3888 return -EBUSY; 3889 nested_vmx_update_pending_dbg(vcpu); 3890 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 3891 return 0; 3892 } 3893 3894 if (vcpu->arch.exception.pending) { 3895 if (vmx->nested.nested_run_pending) 3896 return -EBUSY; 3897 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3898 goto no_vmexit; 3899 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3900 return 0; 3901 } 3902 3903 if (nested_vmx_preemption_timer_pending(vcpu)) { 3904 if (block_nested_events) 3905 return -EBUSY; 3906 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3907 return 0; 3908 } 3909 3910 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 3911 if (block_nested_events) 3912 return -EBUSY; 3913 goto no_vmexit; 3914 } 3915 3916 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 3917 if (block_nested_events) 3918 return -EBUSY; 3919 if (!nested_exit_on_nmi(vcpu)) 3920 goto no_vmexit; 3921 3922 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3923 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3924 INTR_INFO_VALID_MASK, 0); 3925 /* 3926 * The NMI-triggered VM exit counts as injection: 3927 * clear this one and block further NMIs. 3928 */ 3929 vcpu->arch.nmi_pending = 0; 3930 vmx_set_nmi_mask(vcpu, true); 3931 return 0; 3932 } 3933 3934 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 3935 if (block_nested_events) 3936 return -EBUSY; 3937 if (!nested_exit_on_intr(vcpu)) 3938 goto no_vmexit; 3939 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3940 return 0; 3941 } 3942 3943 no_vmexit: 3944 return vmx_complete_nested_posted_interrupt(vcpu); 3945 } 3946 3947 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3948 { 3949 ktime_t remaining = 3950 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 3951 u64 value; 3952 3953 if (ktime_to_ns(remaining) <= 0) 3954 return 0; 3955 3956 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 3957 do_div(value, 1000000); 3958 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 3959 } 3960 3961 static bool is_vmcs12_ext_field(unsigned long field) 3962 { 3963 switch (field) { 3964 case GUEST_ES_SELECTOR: 3965 case GUEST_CS_SELECTOR: 3966 case GUEST_SS_SELECTOR: 3967 case GUEST_DS_SELECTOR: 3968 case GUEST_FS_SELECTOR: 3969 case GUEST_GS_SELECTOR: 3970 case GUEST_LDTR_SELECTOR: 3971 case GUEST_TR_SELECTOR: 3972 case GUEST_ES_LIMIT: 3973 case GUEST_CS_LIMIT: 3974 case GUEST_SS_LIMIT: 3975 case GUEST_DS_LIMIT: 3976 case GUEST_FS_LIMIT: 3977 case GUEST_GS_LIMIT: 3978 case GUEST_LDTR_LIMIT: 3979 case GUEST_TR_LIMIT: 3980 case GUEST_GDTR_LIMIT: 3981 case GUEST_IDTR_LIMIT: 3982 case GUEST_ES_AR_BYTES: 3983 case GUEST_DS_AR_BYTES: 3984 case GUEST_FS_AR_BYTES: 3985 case GUEST_GS_AR_BYTES: 3986 case GUEST_LDTR_AR_BYTES: 3987 case GUEST_TR_AR_BYTES: 3988 case GUEST_ES_BASE: 3989 case GUEST_CS_BASE: 3990 case GUEST_SS_BASE: 3991 case GUEST_DS_BASE: 3992 case GUEST_FS_BASE: 3993 case GUEST_GS_BASE: 3994 case GUEST_LDTR_BASE: 3995 case GUEST_TR_BASE: 3996 case GUEST_GDTR_BASE: 3997 case GUEST_IDTR_BASE: 3998 case GUEST_PENDING_DBG_EXCEPTIONS: 3999 case GUEST_BNDCFGS: 4000 return true; 4001 default: 4002 break; 4003 } 4004 4005 return 
false; 4006 } 4007 4008 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4009 struct vmcs12 *vmcs12) 4010 { 4011 struct vcpu_vmx *vmx = to_vmx(vcpu); 4012 4013 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4014 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4015 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4016 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4017 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4018 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4019 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4020 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4021 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4022 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4023 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4024 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4025 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4026 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4027 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4028 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4029 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4030 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4031 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4032 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4033 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4034 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4035 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4036 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4037 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4038 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4039 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4040 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4041 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4042 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4043 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4044 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4045 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4046 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4047 vmcs12->guest_pending_dbg_exceptions = 4048 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4049 if (kvm_mpx_supported()) 4050 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 4051 4052 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4053 } 4054 4055 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4056 struct vmcs12 *vmcs12) 4057 { 4058 struct vcpu_vmx *vmx = to_vmx(vcpu); 4059 int cpu; 4060 4061 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4062 return; 4063 4064 4065 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4066 4067 cpu = get_cpu(); 4068 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4069 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4070 4071 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4072 4073 vmx->loaded_vmcs = &vmx->vmcs01; 4074 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4075 put_cpu(); 4076 } 4077 4078 /* 4079 * Update the guest state fields of vmcs12 to reflect changes that 4080 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4081 * VM-entry controls is also updated, since this is really a guest 4082 * state bit.) 
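 *
 * The "IA-32e mode guest" handling mentioned above boils down to a
 * single read-modify-write merge, repeated here only for clarity (the
 * authoritative statement is in the function body):
 *
 *	vmcs12->vm_entry_controls =
 *		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
 *		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);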
4083 */ 4084 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4085 { 4086 struct vcpu_vmx *vmx = to_vmx(vcpu); 4087 4088 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 4089 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4090 4091 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4092 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); 4093 4094 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4095 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4096 4097 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4098 vmcs12->guest_rip = kvm_rip_read(vcpu); 4099 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4100 4101 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4102 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4103 4104 vmcs12->guest_interruptibility_info = 4105 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4106 4107 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4108 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4109 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4110 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4111 else 4112 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4113 4114 if (nested_cpu_has_preemption_timer(vmcs12) && 4115 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4116 !vmx->nested.nested_run_pending) 4117 vmcs12->vmx_preemption_timer_value = 4118 vmx_get_preemption_timer_value(vcpu); 4119 4120 /* 4121 * In some cases (usually, nested EPT), L2 is allowed to change its 4122 * own CR3 without exiting. If it has changed it, we must keep it. 4123 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4124 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4125 * 4126 * Additionally, restore L2's PDPTR to vmcs12. 4127 */ 4128 if (enable_ept) { 4129 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4130 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4131 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4132 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4133 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4134 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4135 } 4136 } 4137 4138 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4139 4140 if (nested_cpu_has_vid(vmcs12)) 4141 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4142 4143 vmcs12->vm_entry_controls = 4144 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4145 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4146 4147 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4148 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 4149 4150 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4151 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4152 } 4153 4154 /* 4155 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4156 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4157 * and this function updates it to reflect the changes to the guest state while 4158 * L2 was running (and perhaps made some exits which were handled directly by L0 4159 * without going back to L1), and to reflect the exit reason. 4160 * Note that we do not have to copy here all VMCS fields, just those that 4161 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4162 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4163 * which already writes to vmcs12 directly. 
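 *
 * Rough division of labor on a normal nested VM-exit (driven from
 * nested_vmx_vmexit()):
 *
 *	sync_vmcs02_to_vmcs12()     - guest-state fields
 *	prepare_vmcs12()            - exit-information fields
 *	vmcs12_save_pending_event() - IDT_VECTORING_INFO_FIELD
 *	nested_vmx_store_msr()      - VM-exit MSR-store area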
4164 */ 4165 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4166 u32 vm_exit_reason, u32 exit_intr_info, 4167 unsigned long exit_qualification) 4168 { 4169 /* update exit information fields: */ 4170 vmcs12->vm_exit_reason = vm_exit_reason; 4171 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4172 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4173 vmcs12->exit_qualification = exit_qualification; 4174 vmcs12->vm_exit_intr_info = exit_intr_info; 4175 4176 vmcs12->idt_vectoring_info_field = 0; 4177 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4178 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4179 4180 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4181 vmcs12->launch_state = 1; 4182 4183 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4184 * instead of reading the real value. */ 4185 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4186 4187 /* 4188 * Transfer the event that L0 or L1 may wanted to inject into 4189 * L2 to IDT_VECTORING_INFO_FIELD. 4190 */ 4191 vmcs12_save_pending_event(vcpu, vmcs12); 4192 4193 /* 4194 * According to spec, there's no need to store the guest's 4195 * MSRs if the exit is due to a VM-entry failure that occurs 4196 * during or after loading the guest state. Since this exit 4197 * does not fall in that category, we need to save the MSRs. 4198 */ 4199 if (nested_vmx_store_msr(vcpu, 4200 vmcs12->vm_exit_msr_store_addr, 4201 vmcs12->vm_exit_msr_store_count)) 4202 nested_vmx_abort(vcpu, 4203 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4204 } 4205 4206 /* 4207 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 4208 * preserved above and would only end up incorrectly in L1. 4209 */ 4210 vcpu->arch.nmi_injected = false; 4211 kvm_clear_exception_queue(vcpu); 4212 kvm_clear_interrupt_queue(vcpu); 4213 } 4214 4215 /* 4216 * A part of what we need to when the nested L2 guest exits and we want to 4217 * run its L1 parent, is to reset L1's guest state to the host state specified 4218 * in vmcs12. 4219 * This function is to be called not only on normal nested exit, but also on 4220 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4221 * Failures During or After Loading Guest State"). 4222 * This function should be called when the active VMCS is L1's (vmcs01). 4223 */ 4224 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4225 struct vmcs12 *vmcs12) 4226 { 4227 enum vm_entry_failure_code ignored; 4228 struct kvm_segment seg; 4229 4230 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4231 vcpu->arch.efer = vmcs12->host_ia32_efer; 4232 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4233 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4234 else 4235 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4236 vmx_set_efer(vcpu, vcpu->arch.efer); 4237 4238 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4239 kvm_rip_write(vcpu, vmcs12->host_rip); 4240 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4241 vmx_set_interrupt_shadow(vcpu, 0); 4242 4243 /* 4244 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4245 * actually changed, because vmx_set_cr0 refers to efer set above. 4246 * 4247 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4248 * (KVM doesn't change it); 4249 */ 4250 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4251 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4252 4253 /* Same as above - no reason to call set_cr4_guest_host_mask(). 
*/ 4254 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4255 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4256 4257 nested_ept_uninit_mmu_context(vcpu); 4258 4259 /* 4260 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4261 * couldn't have changed. 4262 */ 4263 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4264 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4265 4266 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4267 4268 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4269 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4270 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4271 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4272 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4273 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4274 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4275 4276 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4277 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4278 vmcs_write64(GUEST_BNDCFGS, 0); 4279 4280 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4281 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4282 vcpu->arch.pat = vmcs12->host_ia32_pat; 4283 } 4284 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 4285 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4286 vmcs12->host_ia32_perf_global_ctrl)); 4287 4288 /* Set L1 segment info according to Intel SDM 4289 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4290 seg = (struct kvm_segment) { 4291 .base = 0, 4292 .limit = 0xFFFFFFFF, 4293 .selector = vmcs12->host_cs_selector, 4294 .type = 11, 4295 .present = 1, 4296 .s = 1, 4297 .g = 1 4298 }; 4299 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4300 seg.l = 1; 4301 else 4302 seg.db = 1; 4303 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4304 seg = (struct kvm_segment) { 4305 .base = 0, 4306 .limit = 0xFFFFFFFF, 4307 .type = 3, 4308 .present = 1, 4309 .s = 1, 4310 .db = 1, 4311 .g = 1 4312 }; 4313 seg.selector = vmcs12->host_ds_selector; 4314 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4315 seg.selector = vmcs12->host_es_selector; 4316 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4317 seg.selector = vmcs12->host_ss_selector; 4318 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4319 seg.selector = vmcs12->host_fs_selector; 4320 seg.base = vmcs12->host_fs_base; 4321 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4322 seg.selector = vmcs12->host_gs_selector; 4323 seg.base = vmcs12->host_gs_base; 4324 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4325 seg = (struct kvm_segment) { 4326 .base = vmcs12->host_tr_base, 4327 .limit = 0x67, 4328 .selector = vmcs12->host_tr_selector, 4329 .type = 11, 4330 .present = 1 4331 }; 4332 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4333 4334 memset(&seg, 0, sizeof(seg)); 4335 seg.unusable = 1; 4336 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4337 4338 kvm_set_dr(vcpu, 7, 0x400); 4339 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4340 4341 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4342 vmcs12->vm_exit_msr_load_count)) 4343 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4344 4345 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4346 } 4347 4348 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4349 { 4350 struct vmx_uret_msr *efer_msr; 4351 unsigned int i; 4352 4353 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4354 return 
vmcs_read64(GUEST_IA32_EFER); 4355 4356 if (cpu_has_load_ia32_efer()) 4357 return host_efer; 4358 4359 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4360 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4361 return vmx->msr_autoload.guest.val[i].value; 4362 } 4363 4364 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4365 if (efer_msr) 4366 return efer_msr->data; 4367 4368 return host_efer; 4369 } 4370 4371 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4372 { 4373 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4374 struct vcpu_vmx *vmx = to_vmx(vcpu); 4375 struct vmx_msr_entry g, h; 4376 gpa_t gpa; 4377 u32 i, j; 4378 4379 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4380 4381 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4382 /* 4383 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4384 * as vmcs01.GUEST_DR7 contains a userspace defined value 4385 * and vcpu->arch.dr7 is not squirreled away before the 4386 * nested VMENTER (not worth adding a variable in nested_vmx). 4387 */ 4388 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4389 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4390 else 4391 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4392 } 4393 4394 /* 4395 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4396 * handle a variety of side effects to KVM's software model. 4397 */ 4398 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4399 4400 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4401 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4402 4403 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4404 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4405 4406 nested_ept_uninit_mmu_context(vcpu); 4407 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4408 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4409 4410 /* 4411 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4412 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4413 * VMFail, like everything else we just need to ensure our 4414 * software model is up-to-date. 4415 */ 4416 if (enable_ept && is_pae_paging(vcpu)) 4417 ept_save_pdptrs(vcpu); 4418 4419 kvm_mmu_reset_context(vcpu); 4420 4421 /* 4422 * This nasty bit of open coding is a compromise between blindly 4423 * loading L1's MSRs using the exit load lists (incorrect emulation 4424 * of VMFail), leaving the nested VM's MSRs in the software model 4425 * (incorrect behavior) and snapshotting the modified MSRs (too 4426 * expensive since the lists are unbound by hardware). For each 4427 * MSR that was (prematurely) loaded from the nested VMEntry load 4428 * list, reload it from the exit load list if it exists and differs 4429 * from the guest value. The intent is to stuff host state as 4430 * silently as possible, not to fully process the exit load list. 
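 *
 * Hypothetical example (the MSR choice is purely illustrative): if the
 * VM-entry load list set MSR_CORE_PERF_GLOBAL_CTRL and the VM-exit load
 * list also contains that MSR with a different value, the loop below in
 * effect does
 *
 *	if (h.index == g.index && h.value != g.value)
 *		kvm_set_msr(vcpu, h.index, h.value);
 *
 * whereas MSRs present only in the entry list are deliberately left
 * untouched.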
4431 */ 4432 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4433 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4434 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4435 pr_debug_ratelimited( 4436 "%s read MSR index failed (%u, 0x%08llx)\n", 4437 __func__, i, gpa); 4438 goto vmabort; 4439 } 4440 4441 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4442 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4443 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4444 pr_debug_ratelimited( 4445 "%s read MSR failed (%u, 0x%08llx)\n", 4446 __func__, j, gpa); 4447 goto vmabort; 4448 } 4449 if (h.index != g.index) 4450 continue; 4451 if (h.value == g.value) 4452 break; 4453 4454 if (nested_vmx_load_msr_check(vcpu, &h)) { 4455 pr_debug_ratelimited( 4456 "%s check failed (%u, 0x%x, 0x%x)\n", 4457 __func__, j, h.index, h.reserved); 4458 goto vmabort; 4459 } 4460 4461 if (kvm_set_msr(vcpu, h.index, h.value)) { 4462 pr_debug_ratelimited( 4463 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4464 __func__, j, h.index, h.value); 4465 goto vmabort; 4466 } 4467 } 4468 } 4469 4470 return; 4471 4472 vmabort: 4473 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4474 } 4475 4476 /* 4477 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4478 * and modify vmcs12 to make it see what it would expect to see there if 4479 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4480 */ 4481 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4482 u32 exit_intr_info, unsigned long exit_qualification) 4483 { 4484 struct vcpu_vmx *vmx = to_vmx(vcpu); 4485 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4486 4487 /* trying to cancel vmlaunch/vmresume is a bug */ 4488 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4489 4490 /* Similarly, triple faults in L2 should never escape. */ 4491 WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); 4492 4493 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4494 /* 4495 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4496 * Enlightened VMCS after migration and we still need to 4497 * do that when something is forcing L2->L1 exit prior to 4498 * the first L2 run. 4499 */ 4500 (void)nested_get_evmcs_page(vcpu); 4501 } 4502 4503 /* Service pending TLB flush requests for L2 before switching to L1. */ 4504 kvm_service_local_tlb_flush_requests(vcpu); 4505 4506 /* 4507 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4508 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4509 * up-to-date before switching to L1. 4510 */ 4511 if (enable_ept && is_pae_paging(vcpu)) 4512 vmx_ept_load_pdptrs(vcpu); 4513 4514 leave_guest_mode(vcpu); 4515 4516 if (nested_cpu_has_preemption_timer(vmcs12)) 4517 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4518 4519 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4520 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4521 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4522 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4523 } 4524 4525 if (likely(!vmx->fail)) { 4526 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4527 4528 if (vm_exit_reason != -1) 4529 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4530 exit_intr_info, exit_qualification); 4531 4532 /* 4533 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4534 * also be used to capture vmcs12 cache as part of 4535 * capturing nVMX state for snapshot (migration). 
4536 * 4537 * Otherwise, this flush will dirty guest memory at a 4538 * point it is already assumed by user-space to be 4539 * immutable. 4540 */ 4541 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4542 } else { 4543 /* 4544 * The only expected VM-instruction error is "VM entry with 4545 * invalid control field(s)." Anything else indicates a 4546 * problem with L0. And we should never get here with a 4547 * VMFail of any type if early consistency checks are enabled. 4548 */ 4549 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4550 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4551 WARN_ON_ONCE(nested_early_check); 4552 } 4553 4554 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4555 4556 /* Update any VMCS fields that might have changed while L2 ran */ 4557 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4558 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4559 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4560 if (kvm_has_tsc_control) 4561 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 4562 4563 if (vmx->nested.l1_tpr_threshold != -1) 4564 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4565 4566 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4567 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4568 vmx_set_virtual_apic_mode(vcpu); 4569 } 4570 4571 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4572 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4573 vmx_update_cpu_dirty_logging(vcpu); 4574 } 4575 4576 /* Unpin physical memory we referred to in vmcs02 */ 4577 if (vmx->nested.apic_access_page) { 4578 kvm_release_page_clean(vmx->nested.apic_access_page); 4579 vmx->nested.apic_access_page = NULL; 4580 } 4581 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4582 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4583 vmx->nested.pi_desc = NULL; 4584 4585 if (vmx->nested.reload_vmcs01_apic_access_page) { 4586 vmx->nested.reload_vmcs01_apic_access_page = false; 4587 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4588 } 4589 4590 if ((vm_exit_reason != -1) && 4591 (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) 4592 vmx->nested.need_vmcs12_to_shadow_sync = true; 4593 4594 /* in case we halted in L2 */ 4595 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4596 4597 if (likely(!vmx->fail)) { 4598 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4599 nested_exit_intr_ack_set(vcpu)) { 4600 int irq = kvm_cpu_get_interrupt(vcpu); 4601 WARN_ON(irq < 0); 4602 vmcs12->vm_exit_intr_info = irq | 4603 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4604 } 4605 4606 if (vm_exit_reason != -1) 4607 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4608 vmcs12->exit_qualification, 4609 vmcs12->idt_vectoring_info_field, 4610 vmcs12->vm_exit_intr_info, 4611 vmcs12->vm_exit_intr_error_code, 4612 KVM_ISA_VMX); 4613 4614 load_vmcs12_host_state(vcpu, vmcs12); 4615 4616 return; 4617 } 4618 4619 /* 4620 * After an early L2 VM-entry failure, we're now back 4621 * in L1 which thinks it just finished a VMLAUNCH or 4622 * VMRESUME instruction, so we need to set the failure 4623 * flag and the VM-instruction error field of the VMCS 4624 * accordingly, and skip the emulated instruction. 4625 */ 4626 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4627 4628 /* 4629 * Restore L1's host state to KVM's software model. 
We're here 4630 * because a consistency check was caught by hardware, which 4631 * means some amount of guest state has been propagated to KVM's 4632 * model and needs to be unwound to the host's state. 4633 */ 4634 nested_vmx_restore_host_state(vcpu); 4635 4636 vmx->fail = 0; 4637 } 4638 4639 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4640 { 4641 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4642 } 4643 4644 /* 4645 * Decode the memory-address operand of a vmx instruction, as recorded on an 4646 * exit caused by such an instruction (run by a guest hypervisor). 4647 * On success, returns 0. When the operand is invalid, returns 1 and throws 4648 * #UD, #GP, or #SS. 4649 */ 4650 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4651 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4652 { 4653 gva_t off; 4654 bool exn; 4655 struct kvm_segment s; 4656 4657 /* 4658 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4659 * Execution", on an exit, vmx_instruction_info holds most of the 4660 * addressing components of the operand. Only the displacement part 4661 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4662 * For how an actual address is calculated from all these components, 4663 * refer to Vol. 1, "Operand Addressing". 4664 */ 4665 int scaling = vmx_instruction_info & 3; 4666 int addr_size = (vmx_instruction_info >> 7) & 7; 4667 bool is_reg = vmx_instruction_info & (1u << 10); 4668 int seg_reg = (vmx_instruction_info >> 15) & 7; 4669 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4670 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4671 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4672 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4673 4674 if (is_reg) { 4675 kvm_queue_exception(vcpu, UD_VECTOR); 4676 return 1; 4677 } 4678 4679 /* Addr = segment_base + offset */ 4680 /* offset = base + [index * scale] + displacement */ 4681 off = exit_qualification; /* holds the displacement */ 4682 if (addr_size == 1) 4683 off = (gva_t)sign_extend64(off, 31); 4684 else if (addr_size == 0) 4685 off = (gva_t)sign_extend64(off, 15); 4686 if (base_is_valid) 4687 off += kvm_register_read(vcpu, base_reg); 4688 if (index_is_valid) 4689 off += kvm_register_read(vcpu, index_reg) << scaling; 4690 vmx_get_segment(vcpu, &s, seg_reg); 4691 4692 /* 4693 * The effective address, i.e. @off, of a memory operand is truncated 4694 * based on the address size of the instruction. Note that this is 4695 * the *effective address*, i.e. the address prior to accounting for 4696 * the segment's base. 4697 */ 4698 if (addr_size == 1) /* 32 bit */ 4699 off &= 0xffffffff; 4700 else if (addr_size == 0) /* 16 bit */ 4701 off &= 0xffff; 4702 4703 /* Checks for #GP/#SS exceptions. */ 4704 exn = false; 4705 if (is_long_mode(vcpu)) { 4706 /* 4707 * The virtual/linear address is never truncated in 64-bit 4708 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4709 * address when using FS/GS with a non-zero base. 4710 */ 4711 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4712 *ret = s.base + off; 4713 else 4714 *ret = off; 4715 4716 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4717 * non-canonical form. This is the only check on the memory 4718 * destination for long mode! 
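 *
 * Worked example with illustrative register choices: for a 32-bit
 * address size (addr_size == 1), a valid base of RAX, a valid index of
 * RCX and scaling == 2, the code above computes
 *
 *	off = (u32)(sign_extend64(disp, 31) + RAX + (RCX << 2));
 *
 * and, for a segment other than FS/GS, *ret is simply off, subject only
 * to the canonicality check below.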
4719 */ 4720 exn = is_noncanonical_address(*ret, vcpu); 4721 } else { 4722 /* 4723 * When not in long mode, the virtual/linear address is 4724 * unconditionally truncated to 32 bits regardless of the 4725 * address size. 4726 */ 4727 *ret = (s.base + off) & 0xffffffff; 4728 4729 /* Protected mode: apply checks for segment validity in the 4730 * following order: 4731 * - segment type check (#GP(0) may be thrown) 4732 * - usability check (#GP(0)/#SS(0)) 4733 * - limit check (#GP(0)/#SS(0)) 4734 */ 4735 if (wr) 4736 /* #GP(0) if the destination operand is located in a 4737 * read-only data segment or any code segment. 4738 */ 4739 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4740 else 4741 /* #GP(0) if the source operand is located in an 4742 * execute-only code segment 4743 */ 4744 exn = ((s.type & 0xa) == 8); 4745 if (exn) { 4746 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4747 return 1; 4748 } 4749 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4750 */ 4751 exn = (s.unusable != 0); 4752 4753 /* 4754 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4755 * outside the segment limit. All CPUs that support VMX ignore 4756 * limit checks for flat segments, i.e. segments with base==0, 4757 * limit==0xffffffff and of type expand-up data or code. 4758 */ 4759 if (!(s.base == 0 && s.limit == 0xffffffff && 4760 ((s.type & 8) || !(s.type & 4)))) 4761 exn = exn || ((u64)off + len - 1 > s.limit); 4762 } 4763 if (exn) { 4764 kvm_queue_exception_e(vcpu, 4765 seg_reg == VCPU_SREG_SS ? 4766 SS_VECTOR : GP_VECTOR, 4767 0); 4768 return 1; 4769 } 4770 4771 return 0; 4772 } 4773 4774 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) 4775 { 4776 struct vcpu_vmx *vmx; 4777 4778 if (!nested_vmx_allowed(vcpu)) 4779 return; 4780 4781 vmx = to_vmx(vcpu); 4782 if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { 4783 vmx->nested.msrs.entry_ctls_high |= 4784 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4785 vmx->nested.msrs.exit_ctls_high |= 4786 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4787 } else { 4788 vmx->nested.msrs.entry_ctls_high &= 4789 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4790 vmx->nested.msrs.exit_ctls_high &= 4791 ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4792 } 4793 } 4794 4795 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 4796 int *ret) 4797 { 4798 gva_t gva; 4799 struct x86_exception e; 4800 int r; 4801 4802 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 4803 vmcs_read32(VMX_INSTRUCTION_INFO), false, 4804 sizeof(*vmpointer), &gva)) { 4805 *ret = 1; 4806 return -EINVAL; 4807 } 4808 4809 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 4810 if (r != X86EMUL_CONTINUE) { 4811 *ret = kvm_handle_memory_failure(vcpu, r, &e); 4812 return -EINVAL; 4813 } 4814 4815 return 0; 4816 } 4817 4818 /* 4819 * Allocate a shadow VMCS and associate it with the currently loaded 4820 * VMCS, unless such a shadow VMCS already exists. The newly allocated 4821 * VMCS is also VMCLEARed, so that it is ready for use. 4822 */ 4823 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 4824 { 4825 struct vcpu_vmx *vmx = to_vmx(vcpu); 4826 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 4827 4828 /* 4829 * We should allocate a shadow vmcs for vmcs01 only when L1 4830 * executes VMXON and free it when L1 executes VMXOFF. 4831 * As it is invalid to execute VMXON twice, we shouldn't reach 4832 * here when vmcs01 already have an allocated shadow vmcs. 
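 *
 * Within this file the caller is enter_vmx_operation(), which treats a
 * NULL return as -ENOMEM for the whole VMXON emulation, roughly:
 *
 *	if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
 *		goto out_shadow_vmcs;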
4833 */ 4834 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); 4835 4836 if (!loaded_vmcs->shadow_vmcs) { 4837 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 4838 if (loaded_vmcs->shadow_vmcs) 4839 vmcs_clear(loaded_vmcs->shadow_vmcs); 4840 } 4841 return loaded_vmcs->shadow_vmcs; 4842 } 4843 4844 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 4845 { 4846 struct vcpu_vmx *vmx = to_vmx(vcpu); 4847 int r; 4848 4849 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 4850 if (r < 0) 4851 goto out_vmcs02; 4852 4853 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4854 if (!vmx->nested.cached_vmcs12) 4855 goto out_cached_vmcs12; 4856 4857 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 4858 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4859 if (!vmx->nested.cached_shadow_vmcs12) 4860 goto out_cached_shadow_vmcs12; 4861 4862 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 4863 goto out_shadow_vmcs; 4864 4865 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 4866 HRTIMER_MODE_ABS_PINNED); 4867 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 4868 4869 vmx->nested.vpid02 = allocate_vpid(); 4870 4871 vmx->nested.vmcs02_initialized = false; 4872 vmx->nested.vmxon = true; 4873 4874 if (vmx_pt_mode_is_host_guest()) { 4875 vmx->pt_desc.guest.ctl = 0; 4876 pt_update_intercept_for_msr(vcpu); 4877 } 4878 4879 return 0; 4880 4881 out_shadow_vmcs: 4882 kfree(vmx->nested.cached_shadow_vmcs12); 4883 4884 out_cached_shadow_vmcs12: 4885 kfree(vmx->nested.cached_vmcs12); 4886 4887 out_cached_vmcs12: 4888 free_loaded_vmcs(&vmx->nested.vmcs02); 4889 4890 out_vmcs02: 4891 return -ENOMEM; 4892 } 4893 4894 /* Emulate the VMXON instruction. */ 4895 static int handle_vmon(struct kvm_vcpu *vcpu) 4896 { 4897 int ret; 4898 gpa_t vmptr; 4899 uint32_t revision; 4900 struct vcpu_vmx *vmx = to_vmx(vcpu); 4901 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 4902 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 4903 4904 /* 4905 * The Intel VMX Instruction Reference lists a bunch of bits that are 4906 * prerequisite to running VMXON, most notably cr4.VMXE must be set to 4907 * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this). 4908 * Otherwise, we should fail with #UD. But most faulting conditions 4909 * have already been checked by hardware, prior to the VM-exit for 4910 * VMXON. We do test guest cr4.VMXE because processor CR4 always has 4911 * that bit set to 1 in non-root mode. 4912 */ 4913 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { 4914 kvm_queue_exception(vcpu, UD_VECTOR); 4915 return 1; 4916 } 4917 4918 /* CPL=0 must be checked manually. 
*/ 4919 if (vmx_get_cpl(vcpu)) { 4920 kvm_inject_gp(vcpu, 0); 4921 return 1; 4922 } 4923 4924 if (vmx->nested.vmxon) 4925 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 4926 4927 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 4928 != VMXON_NEEDED_FEATURES) { 4929 kvm_inject_gp(vcpu, 0); 4930 return 1; 4931 } 4932 4933 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 4934 return ret; 4935 4936 /* 4937 * SDM 3: 24.11.5 4938 * The first 4 bytes of VMXON region contain the supported 4939 * VMCS revision identifier 4940 * 4941 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 4942 * which replaces physical address width with 32 4943 */ 4944 if (!page_address_valid(vcpu, vmptr)) 4945 return nested_vmx_failInvalid(vcpu); 4946 4947 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 4948 revision != VMCS12_REVISION) 4949 return nested_vmx_failInvalid(vcpu); 4950 4951 vmx->nested.vmxon_ptr = vmptr; 4952 ret = enter_vmx_operation(vcpu); 4953 if (ret) 4954 return ret; 4955 4956 return nested_vmx_succeed(vcpu); 4957 } 4958 4959 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 4960 { 4961 struct vcpu_vmx *vmx = to_vmx(vcpu); 4962 4963 if (vmx->nested.current_vmptr == INVALID_GPA) 4964 return; 4965 4966 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 4967 4968 if (enable_shadow_vmcs) { 4969 /* copy to memory all shadowed fields in case 4970 they were modified */ 4971 copy_shadow_to_vmcs12(vmx); 4972 vmx_disable_shadow_vmcs(vmx); 4973 } 4974 vmx->nested.posted_intr_nv = -1; 4975 4976 /* Flush VMCS12 to guest memory */ 4977 kvm_vcpu_write_guest_page(vcpu, 4978 vmx->nested.current_vmptr >> PAGE_SHIFT, 4979 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 4980 4981 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 4982 4983 vmx->nested.current_vmptr = INVALID_GPA; 4984 } 4985 4986 /* Emulate the VMXOFF instruction */ 4987 static int handle_vmoff(struct kvm_vcpu *vcpu) 4988 { 4989 if (!nested_vmx_check_permission(vcpu)) 4990 return 1; 4991 4992 free_nested(vcpu); 4993 4994 /* Process a latched INIT during time CPU was in VMX operation */ 4995 kvm_make_request(KVM_REQ_EVENT, vcpu); 4996 4997 return nested_vmx_succeed(vcpu); 4998 } 4999 5000 /* Emulate the VMCLEAR instruction */ 5001 static int handle_vmclear(struct kvm_vcpu *vcpu) 5002 { 5003 struct vcpu_vmx *vmx = to_vmx(vcpu); 5004 u32 zero = 0; 5005 gpa_t vmptr; 5006 u64 evmcs_gpa; 5007 int r; 5008 5009 if (!nested_vmx_check_permission(vcpu)) 5010 return 1; 5011 5012 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5013 return r; 5014 5015 if (!page_address_valid(vcpu, vmptr)) 5016 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5017 5018 if (vmptr == vmx->nested.vmxon_ptr) 5019 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5020 5021 /* 5022 * When Enlightened VMEntry is enabled on the calling CPU we treat 5023 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 5024 * way to distinguish it from VMCS12) and we must not corrupt it by 5025 * writing to the non-existent 'launch_state' field. The area doesn't 5026 * have to be the currently active EVMCS on the calling CPU and there's 5027 * nothing KVM has to do to transition it from 'active' to 'non-active' 5028 * state. It is possible that the area will stay mapped as 5029 * vmx->nested.hv_evmcs but this shouldn't be a problem. 
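 *
 * Sketch of the dispositions implemented below:
 *
 *	regular VMCS:         release it if it is the current VMCS, then
 *	                      zero its launch_state field in guest memory
 *	active eVMCS pointer: nested_release_evmcs(), no guest memory write
 *	anything else:        no action beyond reporting VMsucceed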
5030 */ 5031 if (likely(!vmx->nested.enlightened_vmcs_enabled || 5032 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 5033 if (vmptr == vmx->nested.current_vmptr) 5034 nested_release_vmcs12(vcpu); 5035 5036 kvm_vcpu_write_guest(vcpu, 5037 vmptr + offsetof(struct vmcs12, 5038 launch_state), 5039 &zero, sizeof(zero)); 5040 } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { 5041 nested_release_evmcs(vcpu); 5042 } 5043 5044 return nested_vmx_succeed(vcpu); 5045 } 5046 5047 /* Emulate the VMLAUNCH instruction */ 5048 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5049 { 5050 return nested_vmx_run(vcpu, true); 5051 } 5052 5053 /* Emulate the VMRESUME instruction */ 5054 static int handle_vmresume(struct kvm_vcpu *vcpu) 5055 { 5056 5057 return nested_vmx_run(vcpu, false); 5058 } 5059 5060 static int handle_vmread(struct kvm_vcpu *vcpu) 5061 { 5062 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5063 : get_vmcs12(vcpu); 5064 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5065 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5066 struct vcpu_vmx *vmx = to_vmx(vcpu); 5067 struct x86_exception e; 5068 unsigned long field; 5069 u64 value; 5070 gva_t gva = 0; 5071 short offset; 5072 int len, r; 5073 5074 if (!nested_vmx_check_permission(vcpu)) 5075 return 1; 5076 5077 /* 5078 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5079 * any VMREAD sets the ALU flags for VMfailInvalid. 5080 */ 5081 if (vmx->nested.current_vmptr == INVALID_GPA || 5082 (is_guest_mode(vcpu) && 5083 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5084 return nested_vmx_failInvalid(vcpu); 5085 5086 /* Decode instruction info and find the field to read */ 5087 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5088 5089 offset = vmcs_field_to_offset(field); 5090 if (offset < 0) 5091 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5092 5093 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5094 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5095 5096 /* Read the field, zero-extended to a u64 value */ 5097 value = vmcs12_read_any(vmcs12, field, offset); 5098 5099 /* 5100 * Now copy part of this value to register or memory, as requested. 5101 * Note that the number of bits actually copied is 32 or 64 depending 5102 * on the guest's mode (32 or 64 bit), not on the given field's length. 5103 */ 5104 if (instr_info & BIT(10)) { 5105 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5106 } else { 5107 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5108 if (get_vmx_mem_address(vcpu, exit_qualification, 5109 instr_info, true, len, &gva)) 5110 return 1; 5111 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5112 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5113 if (r != X86EMUL_CONTINUE) 5114 return kvm_handle_memory_failure(vcpu, r, &e); 5115 } 5116 5117 return nested_vmx_succeed(vcpu); 5118 } 5119 5120 static bool is_shadow_field_rw(unsigned long field) 5121 { 5122 switch (field) { 5123 #define SHADOW_FIELD_RW(x, y) case x: 5124 #include "vmcs_shadow_fields.h" 5125 return true; 5126 default: 5127 break; 5128 } 5129 return false; 5130 } 5131 5132 static bool is_shadow_field_ro(unsigned long field) 5133 { 5134 switch (field) { 5135 #define SHADOW_FIELD_RO(x, y) case x: 5136 #include "vmcs_shadow_fields.h" 5137 return true; 5138 default: 5139 break; 5140 } 5141 return false; 5142 } 5143 5144 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5145 { 5146 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5147 : get_vmcs12(vcpu); 5148 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5149 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5150 struct vcpu_vmx *vmx = to_vmx(vcpu); 5151 struct x86_exception e; 5152 unsigned long field; 5153 short offset; 5154 gva_t gva; 5155 int len, r; 5156 5157 /* 5158 * The value to write might be 32 or 64 bits, depending on L1's long 5159 * mode, and eventually we need to write that into a field of several 5160 * possible lengths. The code below first zero-extends the value to 64 5161 * bit (value), and then copies only the appropriate number of 5162 * bits into the vmcs12 field. 5163 */ 5164 u64 value = 0; 5165 5166 if (!nested_vmx_check_permission(vcpu)) 5167 return 1; 5168 5169 /* 5170 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5171 * any VMWRITE sets the ALU flags for VMfailInvalid. 5172 */ 5173 if (vmx->nested.current_vmptr == INVALID_GPA || 5174 (is_guest_mode(vcpu) && 5175 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5176 return nested_vmx_failInvalid(vcpu); 5177 5178 if (instr_info & BIT(10)) 5179 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5180 else { 5181 len = is_64_bit_mode(vcpu) ? 8 : 4; 5182 if (get_vmx_mem_address(vcpu, exit_qualification, 5183 instr_info, false, len, &gva)) 5184 return 1; 5185 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5186 if (r != X86EMUL_CONTINUE) 5187 return kvm_handle_memory_failure(vcpu, r, &e); 5188 } 5189 5190 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5191 5192 offset = vmcs_field_to_offset(field); 5193 if (offset < 0) 5194 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5195 5196 /* 5197 * If the vCPU supports "VMWRITE to any supported field in the 5198 * VMCS," then the "read-only" fields are actually read/write. 5199 */ 5200 if (vmcs_field_readonly(field) && 5201 !nested_cpu_has_vmwrite_any_field(vcpu)) 5202 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5203 5204 /* 5205 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5206 * vmcs12, else we may crush a field or consume a stale value. 5207 */ 5208 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5209 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5210 5211 /* 5212 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5213 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5214 * behavior regardless of the underlying hardware, e.g. 
if an AR_BYTE 5215 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5216 * from L1 will return a different value than VMREAD from L2 (L1 sees 5217 * the stripped down value, L2 sees the full value as stored by KVM). 5218 */ 5219 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5220 value &= 0x1f0ff; 5221 5222 vmcs12_write_any(vmcs12, field, offset, value); 5223 5224 /* 5225 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5226 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5227 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5228 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5229 */ 5230 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5231 /* 5232 * L1 can read these fields without exiting, ensure the 5233 * shadow VMCS is up-to-date. 5234 */ 5235 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5236 preempt_disable(); 5237 vmcs_load(vmx->vmcs01.shadow_vmcs); 5238 5239 __vmcs_writel(field, value); 5240 5241 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5242 vmcs_load(vmx->loaded_vmcs->vmcs); 5243 preempt_enable(); 5244 } 5245 vmx->nested.dirty_vmcs12 = true; 5246 } 5247 5248 return nested_vmx_succeed(vcpu); 5249 } 5250 5251 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5252 { 5253 vmx->nested.current_vmptr = vmptr; 5254 if (enable_shadow_vmcs) { 5255 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5256 vmcs_write64(VMCS_LINK_POINTER, 5257 __pa(vmx->vmcs01.shadow_vmcs)); 5258 vmx->nested.need_vmcs12_to_shadow_sync = true; 5259 } 5260 vmx->nested.dirty_vmcs12 = true; 5261 } 5262 5263 /* Emulate the VMPTRLD instruction */ 5264 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5265 { 5266 struct vcpu_vmx *vmx = to_vmx(vcpu); 5267 gpa_t vmptr; 5268 int r; 5269 5270 if (!nested_vmx_check_permission(vcpu)) 5271 return 1; 5272 5273 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5274 return r; 5275 5276 if (!page_address_valid(vcpu, vmptr)) 5277 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5278 5279 if (vmptr == vmx->nested.vmxon_ptr) 5280 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5281 5282 /* Forbid normal VMPTRLD if Enlightened version was used */ 5283 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 5284 return 1; 5285 5286 if (vmx->nested.current_vmptr != vmptr) { 5287 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5288 struct vmcs_hdr hdr; 5289 5290 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5291 /* 5292 * Reads from an unbacked page return all 1s, 5293 * which means that the 32 bits located at the 5294 * given physical address won't match the required 5295 * VMCS12_REVISION identifier. 5296 */ 5297 return nested_vmx_fail(vcpu, 5298 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5299 } 5300 5301 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5302 offsetof(struct vmcs12, hdr), 5303 sizeof(hdr))) { 5304 return nested_vmx_fail(vcpu, 5305 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5306 } 5307 5308 if (hdr.revision_id != VMCS12_REVISION || 5309 (hdr.shadow_vmcs && 5310 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5311 return nested_vmx_fail(vcpu, 5312 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5313 } 5314 5315 nested_release_vmcs12(vcpu); 5316 5317 /* 5318 * Load VMCS12 from guest memory since it is not already 5319 * cached. 
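* The gfn_to_hva cache initialized above is reused for the copy, so the guest physical address does not need to be translated again.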
5320 */ 5321 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5322 VMCS12_SIZE)) { 5323 return nested_vmx_fail(vcpu, 5324 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5325 } 5326 5327 set_current_vmptr(vmx, vmptr); 5328 } 5329 5330 return nested_vmx_succeed(vcpu); 5331 } 5332 5333 /* Emulate the VMPTRST instruction */ 5334 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5335 { 5336 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5337 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5338 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5339 struct x86_exception e; 5340 gva_t gva; 5341 int r; 5342 5343 if (!nested_vmx_check_permission(vcpu)) 5344 return 1; 5345 5346 if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) 5347 return 1; 5348 5349 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5350 true, sizeof(gpa_t), &gva)) 5351 return 1; 5352 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5353 r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, 5354 sizeof(gpa_t), &e); 5355 if (r != X86EMUL_CONTINUE) 5356 return kvm_handle_memory_failure(vcpu, r, &e); 5357 5358 return nested_vmx_succeed(vcpu); 5359 } 5360 5361 /* Emulate the INVEPT instruction */ 5362 static int handle_invept(struct kvm_vcpu *vcpu) 5363 { 5364 struct vcpu_vmx *vmx = to_vmx(vcpu); 5365 u32 vmx_instruction_info, types; 5366 unsigned long type, roots_to_free; 5367 struct kvm_mmu *mmu; 5368 gva_t gva; 5369 struct x86_exception e; 5370 struct { 5371 u64 eptp, gpa; 5372 } operand; 5373 int i, r, gpr_index; 5374 5375 if (!(vmx->nested.msrs.secondary_ctls_high & 5376 SECONDARY_EXEC_ENABLE_EPT) || 5377 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5378 kvm_queue_exception(vcpu, UD_VECTOR); 5379 return 1; 5380 } 5381 5382 if (!nested_vmx_check_permission(vcpu)) 5383 return 1; 5384 5385 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5386 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5387 type = kvm_register_read(vcpu, gpr_index); 5388 5389 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5390 5391 if (type >= 32 || !(types & (1 << type))) 5392 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5393 5394 /* According to the Intel VMX instruction reference, the memory 5395 * operand is read even if it isn't needed (e.g., for type==global) 5396 */ 5397 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5398 vmx_instruction_info, false, sizeof(operand), &gva)) 5399 return 1; 5400 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5401 if (r != X86EMUL_CONTINUE) 5402 return kvm_handle_memory_failure(vcpu, r, &e); 5403 5404 /* 5405 * Nested EPT roots are always held through guest_mmu, 5406 * not root_mmu.
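* The INVEPT emulation below therefore only needs to examine guest_mmu's current and previous roots.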
5407 */ 5408 mmu = &vcpu->arch.guest_mmu; 5409 5410 switch (type) { 5411 case VMX_EPT_EXTENT_CONTEXT: 5412 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5413 return nested_vmx_fail(vcpu, 5414 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5415 5416 roots_to_free = 0; 5417 if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd, 5418 operand.eptp)) 5419 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5420 5421 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5422 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5423 mmu->prev_roots[i].pgd, 5424 operand.eptp)) 5425 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5426 } 5427 break; 5428 case VMX_EPT_EXTENT_GLOBAL: 5429 roots_to_free = KVM_MMU_ROOTS_ALL; 5430 break; 5431 default: 5432 BUG(); 5433 break; 5434 } 5435 5436 if (roots_to_free) 5437 kvm_mmu_free_roots(vcpu, mmu, roots_to_free); 5438 5439 return nested_vmx_succeed(vcpu); 5440 } 5441 5442 static int handle_invvpid(struct kvm_vcpu *vcpu) 5443 { 5444 struct vcpu_vmx *vmx = to_vmx(vcpu); 5445 u32 vmx_instruction_info; 5446 unsigned long type, types; 5447 gva_t gva; 5448 struct x86_exception e; 5449 struct { 5450 u64 vpid; 5451 u64 gla; 5452 } operand; 5453 u16 vpid02; 5454 int r, gpr_index; 5455 5456 if (!(vmx->nested.msrs.secondary_ctls_high & 5457 SECONDARY_EXEC_ENABLE_VPID) || 5458 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5459 kvm_queue_exception(vcpu, UD_VECTOR); 5460 return 1; 5461 } 5462 5463 if (!nested_vmx_check_permission(vcpu)) 5464 return 1; 5465 5466 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5467 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5468 type = kvm_register_read(vcpu, gpr_index); 5469 5470 types = (vmx->nested.msrs.vpid_caps & 5471 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5472 5473 if (type >= 32 || !(types & (1 << type))) 5474 return nested_vmx_fail(vcpu, 5475 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5476 5477 /* according to the intel vmx instruction reference, the memory 5478 * operand is read even if it isn't needed (e.g., for type==global) 5479 */ 5480 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5481 vmx_instruction_info, false, sizeof(operand), &gva)) 5482 return 1; 5483 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5484 if (r != X86EMUL_CONTINUE) 5485 return kvm_handle_memory_failure(vcpu, r, &e); 5486 5487 if (operand.vpid >> 16) 5488 return nested_vmx_fail(vcpu, 5489 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5490 5491 vpid02 = nested_get_vpid02(vcpu); 5492 switch (type) { 5493 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5494 if (!operand.vpid || 5495 is_noncanonical_address(operand.gla, vcpu)) 5496 return nested_vmx_fail(vcpu, 5497 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5498 vpid_sync_vcpu_addr(vpid02, operand.gla); 5499 break; 5500 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5501 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5502 if (!operand.vpid) 5503 return nested_vmx_fail(vcpu, 5504 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5505 vpid_sync_context(vpid02); 5506 break; 5507 case VMX_VPID_EXTENT_ALL_CONTEXT: 5508 vpid_sync_context(vpid02); 5509 break; 5510 default: 5511 WARN_ON_ONCE(1); 5512 return kvm_skip_emulated_instruction(vcpu); 5513 } 5514 5515 /* 5516 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5517 * linear mappings for L2 (tagged with L2's VPID). Free all guest 5518 * roots as VPIDs are not tracked in the MMU role. 5519 * 5520 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5521 * an MMU when EPT is disabled. 
5522 * 5523 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR. 5524 */ 5525 if (!enable_ept) 5526 kvm_mmu_free_guest_mode_roots(vcpu, &vcpu->arch.root_mmu); 5527 5528 return nested_vmx_succeed(vcpu); 5529 } 5530 5531 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5532 struct vmcs12 *vmcs12) 5533 { 5534 u32 index = kvm_rcx_read(vcpu); 5535 u64 new_eptp; 5536 5537 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 5538 return 1; 5539 if (index >= VMFUNC_EPTP_ENTRIES) 5540 return 1; 5541 5542 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5543 &new_eptp, index * 8, 8)) 5544 return 1; 5545 5546 /* 5547 * If the (L2) guest does a vmfunc to the currently 5548 * active ept pointer, we don't have to do anything else. 5549 */ 5550 if (vmcs12->ept_pointer != new_eptp) { 5551 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5552 return 1; 5553 5554 vmcs12->ept_pointer = new_eptp; 5555 nested_ept_new_eptp(vcpu); 5556 5557 if (!nested_cpu_has_vpid(vmcs12)) 5558 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 5559 } 5560 5561 return 0; 5562 } 5563 5564 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5565 { 5566 struct vcpu_vmx *vmx = to_vmx(vcpu); 5567 struct vmcs12 *vmcs12; 5568 u32 function = kvm_rax_read(vcpu); 5569 5570 /* 5571 * VMFUNC is only supported for nested guests, but we always enable the 5572 * secondary control for simplicity; for non-nested mode, fake that we 5573 * didn't by injecting #UD. 5574 */ 5575 if (!is_guest_mode(vcpu)) { 5576 kvm_queue_exception(vcpu, UD_VECTOR); 5577 return 1; 5578 } 5579 5580 vmcs12 = get_vmcs12(vcpu); 5581 5582 /* 5583 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 5584 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 5585 */ 5586 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 5587 kvm_queue_exception(vcpu, UD_VECTOR); 5588 return 1; 5589 } 5590 5591 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 5592 goto fail; 5593 5594 switch (function) { 5595 case 0: 5596 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5597 goto fail; 5598 break; 5599 default: 5600 goto fail; 5601 } 5602 return kvm_skip_emulated_instruction(vcpu); 5603 5604 fail: 5605 /* 5606 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5607 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5608 * EXIT_REASON_VMFUNC as the exit reason. 5609 */ 5610 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5611 vmx_get_intr_info(vcpu), 5612 vmx_get_exit_qual(vcpu)); 5613 return 1; 5614 } 5615 5616 /* 5617 * Return true if an IO instruction with the specified port and size should cause 5618 * a VM-exit into L1.
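* For example, with I/O bitmaps enabled, a two-byte access to port 0x3f8 tests bits 0 and 1 of byte 0x7f in io_bitmap_a; either bit being set forces the exit.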
5619 */ 5620 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5621 int size) 5622 { 5623 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5624 gpa_t bitmap, last_bitmap; 5625 u8 b; 5626 5627 last_bitmap = INVALID_GPA; 5628 b = -1; 5629 5630 while (size > 0) { 5631 if (port < 0x8000) 5632 bitmap = vmcs12->io_bitmap_a; 5633 else if (port < 0x10000) 5634 bitmap = vmcs12->io_bitmap_b; 5635 else 5636 return true; 5637 bitmap += (port & 0x7fff) / 8; 5638 5639 if (last_bitmap != bitmap) 5640 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5641 return true; 5642 if (b & (1 << (port & 7))) 5643 return true; 5644 5645 port++; 5646 size--; 5647 last_bitmap = bitmap; 5648 } 5649 5650 return false; 5651 } 5652 5653 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5654 struct vmcs12 *vmcs12) 5655 { 5656 unsigned long exit_qualification; 5657 unsigned short port; 5658 int size; 5659 5660 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5661 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 5662 5663 exit_qualification = vmx_get_exit_qual(vcpu); 5664 5665 port = exit_qualification >> 16; 5666 size = (exit_qualification & 7) + 1; 5667 5668 return nested_vmx_check_io_bitmaps(vcpu, port, size); 5669 } 5670 5671 /* 5672 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 5673 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 5674 * disinterest in the current event (read or write a specific MSR) by using an 5675 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 5676 */ 5677 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 5678 struct vmcs12 *vmcs12, 5679 union vmx_exit_reason exit_reason) 5680 { 5681 u32 msr_index = kvm_rcx_read(vcpu); 5682 gpa_t bitmap; 5683 5684 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 5685 return true; 5686 5687 /* 5688 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 5689 * for the four combinations of read/write and low/high MSR numbers. 5690 * First we need to figure out which of the four to use: 5691 */ 5692 bitmap = vmcs12->msr_bitmap; 5693 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 5694 bitmap += 2048; 5695 if (msr_index >= 0xc0000000) { 5696 msr_index -= 0xc0000000; 5697 bitmap += 1024; 5698 } 5699 5700 /* Then read the msr_index'th bit from this bitmap: */ 5701 if (msr_index < 1024*8) { 5702 unsigned char b; 5703 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 5704 return true; 5705 return 1 & (b >> (msr_index & 7)); 5706 } else 5707 return true; /* let L1 handle the wrong parameter */ 5708 } 5709 5710 /* 5711 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 5712 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 5713 * intercept (via guest_host_mask etc.) the current event. 
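* For CR0/CR4, a bit set in crX_guest_host_mask means L1 owns that bit: a MOV-to-CR value that differs from crX_read_shadow in any owned bit must be reflected to L1.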
5714 */ 5715 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 5716 struct vmcs12 *vmcs12) 5717 { 5718 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5719 int cr = exit_qualification & 15; 5720 int reg; 5721 unsigned long val; 5722 5723 switch ((exit_qualification >> 4) & 3) { 5724 case 0: /* mov to cr */ 5725 reg = (exit_qualification >> 8) & 15; 5726 val = kvm_register_read(vcpu, reg); 5727 switch (cr) { 5728 case 0: 5729 if (vmcs12->cr0_guest_host_mask & 5730 (val ^ vmcs12->cr0_read_shadow)) 5731 return true; 5732 break; 5733 case 3: 5734 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 5735 return true; 5736 break; 5737 case 4: 5738 if (vmcs12->cr4_guest_host_mask & 5739 (vmcs12->cr4_read_shadow ^ val)) 5740 return true; 5741 break; 5742 case 8: 5743 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 5744 return true; 5745 break; 5746 } 5747 break; 5748 case 2: /* clts */ 5749 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 5750 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 5751 return true; 5752 break; 5753 case 1: /* mov from cr */ 5754 switch (cr) { 5755 case 3: 5756 if (vmcs12->cpu_based_vm_exec_control & 5757 CPU_BASED_CR3_STORE_EXITING) 5758 return true; 5759 break; 5760 case 8: 5761 if (vmcs12->cpu_based_vm_exec_control & 5762 CPU_BASED_CR8_STORE_EXITING) 5763 return true; 5764 break; 5765 } 5766 break; 5767 case 3: /* lmsw */ 5768 /* 5769 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 5770 * cr0. Other attempted changes are ignored, with no exit. 5771 */ 5772 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5773 if (vmcs12->cr0_guest_host_mask & 0xe & 5774 (val ^ vmcs12->cr0_read_shadow)) 5775 return true; 5776 if ((vmcs12->cr0_guest_host_mask & 0x1) && 5777 !(vmcs12->cr0_read_shadow & 0x1) && 5778 (val & 0x1)) 5779 return true; 5780 break; 5781 } 5782 return false; 5783 } 5784 5785 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 5786 struct vmcs12 *vmcs12) 5787 { 5788 u32 encls_leaf; 5789 5790 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 5791 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 5792 return false; 5793 5794 encls_leaf = kvm_rax_read(vcpu); 5795 if (encls_leaf > 62) 5796 encls_leaf = 63; 5797 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 5798 } 5799 5800 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 5801 struct vmcs12 *vmcs12, gpa_t bitmap) 5802 { 5803 u32 vmx_instruction_info; 5804 unsigned long field; 5805 u8 b; 5806 5807 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 5808 return true; 5809 5810 /* Decode instruction info and find the field to access */ 5811 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5812 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 5813 5814 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 5815 if (field >> 15) 5816 return true; 5817 5818 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 5819 return true; 5820 5821 return 1 & (b >> (field & 7)); 5822 } 5823 5824 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 5825 { 5826 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 5827 5828 if (nested_cpu_has_mtf(vmcs12)) 5829 return true; 5830 5831 /* 5832 * An MTF VM-exit may be injected into the guest by setting the 5833 * interruption-type to 7 (other event) and the vector field to 0. Such 5834 * is the case regardless of the 'monitor trap flag' VM-execution 5835 * control. 
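* That encoding is bit 31 (valid) set, interruption-type bits 10:8 equal to 7, and a zero vector; the equality check below accepts exactly that value.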
5836 */ 5837 return entry_intr_info == (INTR_INFO_VALID_MASK 5838 | INTR_TYPE_OTHER_EVENT); 5839 } 5840 5841 /* 5842 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 5843 * L1 wants the exit. Only call this when in is_guest_mode (L2). 5844 */ 5845 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 5846 union vmx_exit_reason exit_reason) 5847 { 5848 u32 intr_info; 5849 5850 switch ((u16)exit_reason.basic) { 5851 case EXIT_REASON_EXCEPTION_NMI: 5852 intr_info = vmx_get_intr_info(vcpu); 5853 if (is_nmi(intr_info)) 5854 return true; 5855 else if (is_page_fault(intr_info)) 5856 return vcpu->arch.apf.host_apf_flags || 5857 vmx_need_pf_intercept(vcpu); 5858 else if (is_debug(intr_info) && 5859 vcpu->guest_debug & 5860 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5861 return true; 5862 else if (is_breakpoint(intr_info) && 5863 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5864 return true; 5865 else if (is_alignment_check(intr_info) && 5866 !vmx_guest_inject_ac(vcpu)) 5867 return true; 5868 return false; 5869 case EXIT_REASON_EXTERNAL_INTERRUPT: 5870 return true; 5871 case EXIT_REASON_MCE_DURING_VMENTRY: 5872 return true; 5873 case EXIT_REASON_EPT_VIOLATION: 5874 /* 5875 * L0 always deals with the EPT violation. If nested EPT is 5876 * used, and the nested mmu code discovers that the address is 5877 * missing in the guest EPT table (EPT12), the EPT violation 5878 * will be injected with nested_ept_inject_page_fault() 5879 */ 5880 return true; 5881 case EXIT_REASON_EPT_MISCONFIG: 5882 /* 5883 * L2 never directly uses L1's EPT, but rather L0's own EPT 5884 * table (shadow on EPT) or a merged EPT table that L0 built 5885 * (EPT on EPT). So any problems with the structure of the 5886 * table are L0's fault. 5887 */ 5888 return true; 5889 case EXIT_REASON_PREEMPTION_TIMER: 5890 return true; 5891 case EXIT_REASON_PML_FULL: 5892 /* 5893 * PML is emulated for an L1 VMM and should never be enabled in 5894 * vmcs02; always "handle" PML_FULL by exiting to userspace. 5895 */ 5896 return true; 5897 case EXIT_REASON_VMFUNC: 5898 /* VM functions are emulated through L2->L0 vmexits. */ 5899 return true; 5900 case EXIT_REASON_BUS_LOCK: 5901 /* 5902 * At present, bus lock VM exit is never exposed to L1. 5903 * Handle L2's bus locks in L0 directly. 5904 */ 5905 return true; 5906 default: 5907 break; 5908 } 5909 return false; 5910 } 5911 5912 /* 5913 * Return true if L1 wants to intercept an exit from L2. Only call this when in 5914 * is_guest_mode (L2).
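* A true return here causes nested_vmx_reflect_vmexit() to forward the exit to L1 via nested_vmx_vmexit(); a false return leaves the exit to L0's own handlers.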
5915 */ 5916 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 5917 union vmx_exit_reason exit_reason) 5918 { 5919 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5920 u32 intr_info; 5921 5922 switch ((u16)exit_reason.basic) { 5923 case EXIT_REASON_EXCEPTION_NMI: 5924 intr_info = vmx_get_intr_info(vcpu); 5925 if (is_nmi(intr_info)) 5926 return true; 5927 else if (is_page_fault(intr_info)) 5928 return true; 5929 return vmcs12->exception_bitmap & 5930 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 5931 case EXIT_REASON_EXTERNAL_INTERRUPT: 5932 return nested_exit_on_intr(vcpu); 5933 case EXIT_REASON_TRIPLE_FAULT: 5934 return true; 5935 case EXIT_REASON_INTERRUPT_WINDOW: 5936 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 5937 case EXIT_REASON_NMI_WINDOW: 5938 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 5939 case EXIT_REASON_TASK_SWITCH: 5940 return true; 5941 case EXIT_REASON_CPUID: 5942 return true; 5943 case EXIT_REASON_HLT: 5944 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 5945 case EXIT_REASON_INVD: 5946 return true; 5947 case EXIT_REASON_INVLPG: 5948 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5949 case EXIT_REASON_RDPMC: 5950 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 5951 case EXIT_REASON_RDRAND: 5952 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 5953 case EXIT_REASON_RDSEED: 5954 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 5955 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 5956 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 5957 case EXIT_REASON_VMREAD: 5958 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5959 vmcs12->vmread_bitmap); 5960 case EXIT_REASON_VMWRITE: 5961 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5962 vmcs12->vmwrite_bitmap); 5963 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 5964 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 5965 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 5966 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 5967 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 5968 /* 5969 * VMX instructions trap unconditionally. This allows L1 to 5970 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
5971 */ 5972 return true; 5973 case EXIT_REASON_CR_ACCESS: 5974 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 5975 case EXIT_REASON_DR_ACCESS: 5976 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 5977 case EXIT_REASON_IO_INSTRUCTION: 5978 return nested_vmx_exit_handled_io(vcpu, vmcs12); 5979 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 5980 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 5981 case EXIT_REASON_MSR_READ: 5982 case EXIT_REASON_MSR_WRITE: 5983 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 5984 case EXIT_REASON_INVALID_STATE: 5985 return true; 5986 case EXIT_REASON_MWAIT_INSTRUCTION: 5987 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 5988 case EXIT_REASON_MONITOR_TRAP_FLAG: 5989 return nested_vmx_exit_handled_mtf(vmcs12); 5990 case EXIT_REASON_MONITOR_INSTRUCTION: 5991 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 5992 case EXIT_REASON_PAUSE_INSTRUCTION: 5993 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 5994 nested_cpu_has2(vmcs12, 5995 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 5996 case EXIT_REASON_MCE_DURING_VMENTRY: 5997 return true; 5998 case EXIT_REASON_TPR_BELOW_THRESHOLD: 5999 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6000 case EXIT_REASON_APIC_ACCESS: 6001 case EXIT_REASON_APIC_WRITE: 6002 case EXIT_REASON_EOI_INDUCED: 6003 /* 6004 * The controls for "virtualize APIC accesses," "APIC- 6005 * register virtualization," and "virtual-interrupt 6006 * delivery" only come from vmcs12. 6007 */ 6008 return true; 6009 case EXIT_REASON_INVPCID: 6010 return 6011 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6012 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6013 case EXIT_REASON_WBINVD: 6014 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6015 case EXIT_REASON_XSETBV: 6016 return true; 6017 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6018 /* 6019 * This should never happen, since it is not possible to 6020 * set XSS to a non-zero value---neither in L1 nor in L2. 6021 * If it were, XSS would have to be checked against 6022 * the XSS exit bitmap in vmcs12. 6023 */ 6024 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 6025 case EXIT_REASON_UMWAIT: 6026 case EXIT_REASON_TPAUSE: 6027 return nested_cpu_has2(vmcs12, 6028 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6029 case EXIT_REASON_ENCLS: 6030 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6031 default: 6032 return true; 6033 } 6034 } 6035 6036 /* 6037 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6038 * reflected into L1. 6039 */ 6040 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6041 { 6042 struct vcpu_vmx *vmx = to_vmx(vcpu); 6043 union vmx_exit_reason exit_reason = vmx->exit_reason; 6044 unsigned long exit_qual; 6045 u32 exit_intr_info; 6046 6047 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6048 6049 /* 6050 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6051 * has already loaded L2's state. 6052 */ 6053 if (unlikely(vmx->fail)) { 6054 trace_kvm_nested_vmenter_failed( 6055 "hardware VM-instruction error: ", 6056 vmcs_read32(VM_INSTRUCTION_ERROR)); 6057 exit_intr_info = 0; 6058 exit_qual = 0; 6059 goto reflect_vmexit; 6060 } 6061 6062 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6063 6064 /* If L0 (KVM) wants the exit, it trumps L1's desires. */ 6065 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6066 return false; 6067 6068 /* If L1 doesn't want the exit, handle it in L0.
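* I.e. return false and let KVM's normal (L0) exit handlers process it.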
*/ 6069 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6070 return false; 6071 6072 /* 6073 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6074 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6075 * need to be synthesized by querying the in-kernel LAPIC, but external 6076 * interrupts are never reflected to L1 so it's a non-issue. 6077 */ 6078 exit_intr_info = vmx_get_intr_info(vcpu); 6079 if (is_exception_with_error_code(exit_intr_info)) { 6080 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6081 6082 vmcs12->vm_exit_intr_error_code = 6083 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6084 } 6085 exit_qual = vmx_get_exit_qual(vcpu); 6086 6087 reflect_vmexit: 6088 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6089 return true; 6090 } 6091 6092 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6093 struct kvm_nested_state __user *user_kvm_nested_state, 6094 u32 user_data_size) 6095 { 6096 struct vcpu_vmx *vmx; 6097 struct vmcs12 *vmcs12; 6098 struct kvm_nested_state kvm_state = { 6099 .flags = 0, 6100 .format = KVM_STATE_NESTED_FORMAT_VMX, 6101 .size = sizeof(kvm_state), 6102 .hdr.vmx.flags = 0, 6103 .hdr.vmx.vmxon_pa = INVALID_GPA, 6104 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6105 .hdr.vmx.preemption_timer_deadline = 0, 6106 }; 6107 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6108 &user_kvm_nested_state->data.vmx[0]; 6109 6110 if (!vcpu) 6111 return kvm_state.size + sizeof(*user_vmx_nested_state); 6112 6113 vmx = to_vmx(vcpu); 6114 vmcs12 = get_vmcs12(vcpu); 6115 6116 if (nested_vmx_allowed(vcpu) && 6117 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6118 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6119 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6120 6121 if (vmx_has_valid_vmcs12(vcpu)) { 6122 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6123 6124 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6125 if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) 6126 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6127 6128 if (is_guest_mode(vcpu) && 6129 nested_cpu_has_shadow_vmcs(vmcs12) && 6130 vmcs12->vmcs_link_pointer != INVALID_GPA) 6131 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6132 } 6133 6134 if (vmx->nested.smm.vmxon) 6135 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6136 6137 if (vmx->nested.smm.guest_mode) 6138 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6139 6140 if (is_guest_mode(vcpu)) { 6141 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6142 6143 if (vmx->nested.nested_run_pending) 6144 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6145 6146 if (vmx->nested.mtf_pending) 6147 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6148 6149 if (nested_cpu_has_preemption_timer(vmcs12) && 6150 vmx->nested.has_preemption_timer_deadline) { 6151 kvm_state.hdr.vmx.flags |= 6152 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6153 kvm_state.hdr.vmx.preemption_timer_deadline = 6154 vmx->nested.preemption_timer_deadline; 6155 } 6156 } 6157 } 6158 6159 if (user_data_size < kvm_state.size) 6160 goto out; 6161 6162 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6163 return -EFAULT; 6164 6165 if (!vmx_has_valid_vmcs12(vcpu)) 6166 goto out; 6167 6168 /* 6169 * When running L2, the authoritative vmcs12 state is in the 6170 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6171 * in the shadow or enlightened vmcs linked to vmcs01, unless 6172 * need_vmcs12_to_shadow_sync is set, in which case the authoritative 6173 * vmcs12 state is in the vmcs12 already. 6174 */ 6175 if (is_guest_mode(vcpu)) { 6176 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6177 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6178 } else { 6179 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6180 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6181 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 6182 /* 6183 * The L1 hypervisor is not obliged to keep the eVMCS 6184 * clean-fields data up-to-date while not in 6185 * guest mode; 'hv_clean_fields' is only guaranteed 6186 * to be valid at VM-entry, so ignore it here 6187 * and do a full copy. 6188 */ 6189 copy_enlightened_to_vmcs12(vmx, 0); 6190 else if (enable_shadow_vmcs) 6191 copy_shadow_to_vmcs12(vmx); 6192 } 6193 } 6194 6195 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6196 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6197 6198 /* 6199 * Copy over the full allocated size of vmcs12 rather than just the size 6200 * of the struct. 6201 */ 6202 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6203 return -EFAULT; 6204 6205 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6206 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6207 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6208 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6209 return -EFAULT; 6210 } 6211 out: 6212 return kvm_state.size; 6213 } 6214 6215 /* 6216 * Forcibly leave nested mode so that the VCPU can be reset later on. 6217 */ 6218 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6219 { 6220 if (is_guest_mode(vcpu)) { 6221 to_vmx(vcpu)->nested.nested_run_pending = 0; 6222 nested_vmx_vmexit(vcpu, -1, 0, 0); 6223 } 6224 free_nested(vcpu); 6225 } 6226 6227 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6228 struct kvm_nested_state __user *user_kvm_nested_state, 6229 struct kvm_nested_state *kvm_state) 6230 { 6231 struct vcpu_vmx *vmx = to_vmx(vcpu); 6232 struct vmcs12 *vmcs12; 6233 enum vm_entry_failure_code ignored; 6234 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6235 &user_kvm_nested_state->data.vmx[0]; 6236 int ret; 6237 6238 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6239 return -EINVAL; 6240 6241 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6242 if (kvm_state->hdr.vmx.smm.flags) 6243 return -EINVAL; 6244 6245 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6246 return -EINVAL; 6247 6248 /* 6249 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6250 * enable the eVMCS capability on the vCPU. However, the code 6251 * has since been changed so that the flag signals that vmcs12 6252 * should be copied into the eVMCS in guest memory. 6253 * 6254 * To preserve backward compatibility, allow userspace 6255 * to set this flag even when there is no VMXON region.
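* Concretely, the check below rejects any flag other than KVM_STATE_NESTED_EVMCS when vmxon_pa is INVALID_GPA.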
6256 */ 6257 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6258 return -EINVAL; 6259 } else { 6260 if (!nested_vmx_allowed(vcpu)) 6261 return -EINVAL; 6262 6263 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6264 return -EINVAL; 6265 } 6266 6267 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6268 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6269 return -EINVAL; 6270 6271 if (kvm_state->hdr.vmx.smm.flags & 6272 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6273 return -EINVAL; 6274 6275 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6276 return -EINVAL; 6277 6278 /* 6279 * SMM temporarily disables VMX, so we cannot be in guest mode, 6280 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6281 * must be zero. 6282 */ 6283 if (is_smm(vcpu) ? 6284 (kvm_state->flags & 6285 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6286 : kvm_state->hdr.vmx.smm.flags) 6287 return -EINVAL; 6288 6289 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6290 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6291 return -EINVAL; 6292 6293 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6294 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) 6295 return -EINVAL; 6296 6297 vmx_leave_nested(vcpu); 6298 6299 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6300 return 0; 6301 6302 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6303 ret = enter_vmx_operation(vcpu); 6304 if (ret) 6305 return ret; 6306 6307 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6308 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6309 /* See vmx_has_valid_vmcs12. */ 6310 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6311 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6312 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6313 return -EINVAL; 6314 else 6315 return 0; 6316 } 6317 6318 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6319 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6320 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6321 return -EINVAL; 6322 6323 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6324 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6325 /* 6326 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6327 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6328 * restored yet. EVMCS will be mapped from 6329 * nested_get_vmcs12_pages(). 
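* The KVM_REQ_GET_NESTED_STATE_PAGES request made below ensures the mapping is established before the vCPU next enters the guest.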
6330 */ 6331 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; 6332 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 6333 } else { 6334 return -EINVAL; 6335 } 6336 6337 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 6338 vmx->nested.smm.vmxon = true; 6339 vmx->nested.vmxon = false; 6340 6341 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 6342 vmx->nested.smm.guest_mode = true; 6343 } 6344 6345 vmcs12 = get_vmcs12(vcpu); 6346 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 6347 return -EFAULT; 6348 6349 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 6350 return -EINVAL; 6351 6352 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6353 return 0; 6354 6355 vmx->nested.nested_run_pending = 6356 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 6357 6358 vmx->nested.mtf_pending = 6359 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 6360 6361 ret = -EINVAL; 6362 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6363 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6364 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6365 6366 if (kvm_state->size < 6367 sizeof(*kvm_state) + 6368 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 6369 goto error_guest_mode; 6370 6371 if (copy_from_user(shadow_vmcs12, 6372 user_vmx_nested_state->shadow_vmcs12, 6373 sizeof(*shadow_vmcs12))) { 6374 ret = -EFAULT; 6375 goto error_guest_mode; 6376 } 6377 6378 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6379 !shadow_vmcs12->hdr.shadow_vmcs) 6380 goto error_guest_mode; 6381 } 6382 6383 vmx->nested.has_preemption_timer_deadline = false; 6384 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 6385 vmx->nested.has_preemption_timer_deadline = true; 6386 vmx->nested.preemption_timer_deadline = 6387 kvm_state->hdr.vmx.preemption_timer_deadline; 6388 } 6389 6390 if (nested_vmx_check_controls(vcpu, vmcs12) || 6391 nested_vmx_check_host_state(vcpu, vmcs12) || 6392 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6393 goto error_guest_mode; 6394 6395 vmx->nested.dirty_vmcs12 = true; 6396 ret = nested_vmx_enter_non_root_mode(vcpu, false); 6397 if (ret) 6398 goto error_guest_mode; 6399 6400 return 0; 6401 6402 error_guest_mode: 6403 vmx->nested.nested_run_pending = 0; 6404 return ret; 6405 } 6406 6407 void nested_vmx_set_vmcs_shadowing_bitmap(void) 6408 { 6409 if (enable_shadow_vmcs) { 6410 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 6411 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 6412 } 6413 } 6414 6415 /* 6416 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo 6417 * that madness to get the encoding for comparison. 6418 */ 6419 #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) 6420 6421 static u64 nested_vmx_calc_vmcs_enum_msr(void) 6422 { 6423 /* 6424 * Note these are the so called "index" of the VMCS field encoding, not 6425 * the index into vmcs12. 6426 */ 6427 unsigned int max_idx, idx; 6428 int i; 6429 6430 /* 6431 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in 6432 * vmcs12, regardless of whether or not the associated feature is 6433 * exposed to L1. Simply find the field with the highest index. 6434 */ 6435 max_idx = 0; 6436 for (i = 0; i < nr_vmcs12_fields; i++) { 6437 /* The vmcs12 table is very, very sparsely populated. 
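* A zero offset marks an encoding with no backing vmcs12 field; such entries are skipped below.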
*/ 6438 if (!vmcs_field_to_offset_table[i]) 6439 continue; 6440 6441 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); 6442 if (idx > max_idx) 6443 max_idx = idx; 6444 } 6445 6446 return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; 6447 } 6448 6449 /* 6450 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 6451 * returned for the various VMX controls MSRs when nested VMX is enabled. 6452 * The same values should also be used to verify that vmcs12 control fields are 6453 * valid during nested entry from L1 to L2. 6454 * Each of these control msrs has a low and high 32-bit half: A low bit is on 6455 * if the corresponding bit in the (32-bit) control field *must* be on, and a 6456 * bit in the high half is on if the corresponding bit in the control field 6457 * may be on. See also vmx_control_verify(). 6458 */ 6459 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) 6460 { 6461 /* 6462 * Note that as a general rule, the high half of the MSRs (bits in 6463 * the control fields which may be 1) should be initialized by the 6464 * intersection of the underlying hardware's MSR (i.e., features which 6465 * can be supported) and the list of features we want to expose - 6466 * because they are known to be properly supported in our code. 6467 * Also, usually, the low half of the MSRs (bits which must be 1) can 6468 * be set to 0, meaning that L1 may turn off any of these bits. The 6469 * reason is that if one of these bits is necessary, it will appear 6470 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 6471 * fields of vmcs01 and vmcs02, will turn these bits off - and 6472 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 6473 * These rules have exceptions below. 6474 */ 6475 6476 /* pin-based controls */ 6477 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 6478 msrs->pinbased_ctls_low, 6479 msrs->pinbased_ctls_high); 6480 msrs->pinbased_ctls_low |= 6481 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6482 msrs->pinbased_ctls_high &= 6483 PIN_BASED_EXT_INTR_MASK | 6484 PIN_BASED_NMI_EXITING | 6485 PIN_BASED_VIRTUAL_NMIS | 6486 (enable_apicv ? PIN_BASED_POSTED_INTR : 0); 6487 msrs->pinbased_ctls_high |= 6488 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6489 PIN_BASED_VMX_PREEMPTION_TIMER; 6490 6491 /* exit controls */ 6492 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 6493 msrs->exit_ctls_low, 6494 msrs->exit_ctls_high); 6495 msrs->exit_ctls_low = 6496 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 6497 6498 msrs->exit_ctls_high &= 6499 #ifdef CONFIG_X86_64 6500 VM_EXIT_HOST_ADDR_SPACE_SIZE | 6501 #endif 6502 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 6503 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 6504 msrs->exit_ctls_high |= 6505 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 6506 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 6507 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 6508 6509 /* We support free control of debug control saving. */ 6510 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 6511 6512 /* entry controls */ 6513 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 6514 msrs->entry_ctls_low, 6515 msrs->entry_ctls_high); 6516 msrs->entry_ctls_low = 6517 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 6518 msrs->entry_ctls_high &= 6519 #ifdef CONFIG_X86_64 6520 VM_ENTRY_IA32E_MODE | 6521 #endif 6522 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | 6523 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 6524 msrs->entry_ctls_high |= 6525 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 6526 6527 /* We support free control of debug control loading. 
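* Clearing the bit from the "must be 1" (low) half lets L1 leave it off.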
*/ 6528 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 6529 6530 /* cpu-based controls */ 6531 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 6532 msrs->procbased_ctls_low, 6533 msrs->procbased_ctls_high); 6534 msrs->procbased_ctls_low = 6535 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6536 msrs->procbased_ctls_high &= 6537 CPU_BASED_INTR_WINDOW_EXITING | 6538 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 6539 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 6540 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 6541 CPU_BASED_CR3_STORE_EXITING | 6542 #ifdef CONFIG_X86_64 6543 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 6544 #endif 6545 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 6546 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 6547 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 6548 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 6549 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 6550 /* 6551 * We can allow some features even when not supported by the 6552 * hardware. For example, L1 can specify an MSR bitmap - and we 6553 * can use it to avoid exits to L1 - even when L0 runs L2 6554 * without MSR bitmaps. 6555 */ 6556 msrs->procbased_ctls_high |= 6557 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6558 CPU_BASED_USE_MSR_BITMAPS; 6559 6560 /* We support free control of CR3 access interception. */ 6561 msrs->procbased_ctls_low &= 6562 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 6563 6564 /* 6565 * secondary cpu-based controls. Do not include those that 6566 * depend on CPUID bits, they are added later by 6567 * vmx_vcpu_after_set_cpuid. 6568 */ 6569 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) 6570 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 6571 msrs->secondary_ctls_low, 6572 msrs->secondary_ctls_high); 6573 6574 msrs->secondary_ctls_low = 0; 6575 msrs->secondary_ctls_high &= 6576 SECONDARY_EXEC_DESC | 6577 SECONDARY_EXEC_ENABLE_RDTSCP | 6578 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 6579 SECONDARY_EXEC_WBINVD_EXITING | 6580 SECONDARY_EXEC_APIC_REGISTER_VIRT | 6581 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 6582 SECONDARY_EXEC_RDRAND_EXITING | 6583 SECONDARY_EXEC_ENABLE_INVPCID | 6584 SECONDARY_EXEC_RDSEED_EXITING | 6585 SECONDARY_EXEC_XSAVES | 6586 SECONDARY_EXEC_TSC_SCALING; 6587 6588 /* 6589 * We can emulate "VMCS shadowing," even if the hardware 6590 * doesn't support it. 
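* VMREAD and VMWRITE executed by L2 always exit to L0, which consults vmcs12's VMREAD/VMWRITE bitmaps and emulates the access against the cached shadow vmcs12, so no hardware support is needed.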
6591 */ 6592 msrs->secondary_ctls_high |= 6593 SECONDARY_EXEC_SHADOW_VMCS; 6594 6595 if (enable_ept) { 6596 /* nested EPT: emulate EPT also to L1 */ 6597 msrs->secondary_ctls_high |= 6598 SECONDARY_EXEC_ENABLE_EPT; 6599 msrs->ept_caps = 6600 VMX_EPT_PAGE_WALK_4_BIT | 6601 VMX_EPT_PAGE_WALK_5_BIT | 6602 VMX_EPTP_WB_BIT | 6603 VMX_EPT_INVEPT_BIT | 6604 VMX_EPT_EXECUTE_ONLY_BIT; 6605 6606 msrs->ept_caps &= ept_caps; 6607 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 6608 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 6609 VMX_EPT_1GB_PAGE_BIT; 6610 if (enable_ept_ad_bits) { 6611 msrs->secondary_ctls_high |= 6612 SECONDARY_EXEC_ENABLE_PML; 6613 msrs->ept_caps |= VMX_EPT_AD_BIT; 6614 } 6615 } 6616 6617 if (cpu_has_vmx_vmfunc()) { 6618 msrs->secondary_ctls_high |= 6619 SECONDARY_EXEC_ENABLE_VMFUNC; 6620 /* 6621 * Advertise EPTP switching unconditionally 6622 * since we emulate it 6623 */ 6624 if (enable_ept) 6625 msrs->vmfunc_controls = 6626 VMX_VMFUNC_EPTP_SWITCHING; 6627 } 6628 6629 /* 6630 * Old versions of KVM use the single-context version without 6631 * checking for support, so declare that it is supported even 6632 * though it is treated as global context. The alternative is 6633 * not failing the single-context invvpid, and it is worse. 6634 */ 6635 if (enable_vpid) { 6636 msrs->secondary_ctls_high |= 6637 SECONDARY_EXEC_ENABLE_VPID; 6638 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 6639 VMX_VPID_EXTENT_SUPPORTED_MASK; 6640 } 6641 6642 if (enable_unrestricted_guest) 6643 msrs->secondary_ctls_high |= 6644 SECONDARY_EXEC_UNRESTRICTED_GUEST; 6645 6646 if (flexpriority_enabled) 6647 msrs->secondary_ctls_high |= 6648 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6649 6650 if (enable_sgx) 6651 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 6652 6653 /* miscellaneous data */ 6654 rdmsr(MSR_IA32_VMX_MISC, 6655 msrs->misc_low, 6656 msrs->misc_high); 6657 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; 6658 msrs->misc_low |= 6659 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 6660 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 6661 VMX_MISC_ACTIVITY_HLT | 6662 VMX_MISC_ACTIVITY_WAIT_SIPI; 6663 msrs->misc_high = 0; 6664 6665 /* 6666 * This MSR reports some information about VMX support. We 6667 * should return information about the VMX we emulate for the 6668 * guest, and the VMCS structure we give it - not about the 6669 * VMX support of the underlying hardware. 6670 */ 6671 msrs->basic = 6672 VMCS12_REVISION | 6673 VMX_BASIC_TRUE_CTLS | 6674 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 6675 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 6676 6677 if (cpu_has_vmx_basic_inout()) 6678 msrs->basic |= VMX_BASIC_INOUT; 6679 6680 /* 6681 * These MSRs specify bits which the guest must keep fixed on 6682 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 6683 * We picked the standard core2 setting. 6684 */ 6685 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 6686 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 6687 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 6688 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 6689 6690 /* These MSRs specify bits which the guest must keep fixed off. 
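* I.e. bits that are clear in the CR0/CR4_FIXED1 MSRs must remain clear while in VMX operation; the host's values are passed straight through.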
*/ 6691 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 6692 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 6693 6694 msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); 6695 } 6696 6697 void nested_vmx_hardware_unsetup(void) 6698 { 6699 int i; 6700 6701 if (enable_shadow_vmcs) { 6702 for (i = 0; i < VMX_BITMAP_NR; i++) 6703 free_page((unsigned long)vmx_bitmap[i]); 6704 } 6705 } 6706 6707 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 6708 { 6709 int i; 6710 6711 if (!cpu_has_vmx_shadow_vmcs()) 6712 enable_shadow_vmcs = 0; 6713 if (enable_shadow_vmcs) { 6714 for (i = 0; i < VMX_BITMAP_NR; i++) { 6715 /* 6716 * The vmx_bitmap is not tied to a VM and so should 6717 * not be charged to a memcg. 6718 */ 6719 vmx_bitmap[i] = (unsigned long *) 6720 __get_free_page(GFP_KERNEL); 6721 if (!vmx_bitmap[i]) { 6722 nested_vmx_hardware_unsetup(); 6723 return -ENOMEM; 6724 } 6725 } 6726 6727 init_vmcs_shadow_fields(); 6728 } 6729 6730 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 6731 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 6732 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 6733 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 6734 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 6735 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 6736 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 6737 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; 6738 exit_handlers[EXIT_REASON_VMON] = handle_vmon; 6739 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 6740 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 6741 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 6742 6743 return 0; 6744 } 6745 6746 struct kvm_x86_nested_ops vmx_nested_ops = { 6747 .check_events = vmx_check_nested_events, 6748 .hv_timer_pending = nested_vmx_preemption_timer_pending, 6749 .triple_fault = nested_vmx_triple_fault, 6750 .get_state = vmx_get_nested_state, 6751 .set_state = vmx_set_nested_state, 6752 .get_nested_state_pages = vmx_get_nested_state_pages, 6753 .write_log_dirty = nested_vmx_write_pml_buffer, 6754 .enable_evmcs = nested_enable_evmcs, 6755 .get_evmcs_version = nested_get_evmcs_version, 6756 }; 6757