// SPDX-License-Identifier: GPL-2.0

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't simply reset the guest here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
		kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
		vmx->nested.hv_evmcs = NULL;
	}

	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vmx_register_cache_reset(vcpu);
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = INVALID_GPA;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_clean(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}

#define EPTP_PA_MASK	GENMASK_ULL(51, 12)

static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
{
	return VALID_PAGE(root_hpa) &&
	       ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
}

static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
				       gpa_t addr)
{
	uint i;
	struct kvm_mmu_root_info *cached_root;

	WARN_ON_ONCE(!mmu_is_nested(vcpu));

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		cached_root = &vcpu->arch.mmu->prev_roots[i];

		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
					    eptp))
			vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa);
	}
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vm_exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else {
		if (fault->error_code & PFERR_RSVD_MASK)
			vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
		else
			vm_exit_reason = EXIT_REASON_EPT_VIOLATION;

		/*
		 * Although the caller (kvm_inject_emulated_page_fault) would
		 * have already synced the faulting address in the shadow EPT
		 * tables for the current EPTP12, we also need to sync it for
		 * any other cached EPTP02s based on the same EP4TA, since the
		 * TLB associates mappings to the EP4TA rather than the full EPTP.
		 */
		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
					   fault->address);
	}

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
	kvm_init_shadow_ept_mmu(vcpu,
				to_vmx(vcpu)->nested.msrs.ept_caps &
				VMX_EPT_EXECUTE_ONLY_BIT,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	nested_ept_new_eptp(vcpu);
	vcpu->arch.mmu->get_guest_pgd     = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}


/*
 * KVM wants to inject page-faults which it got to the guest. This function
 * checks whether in a nested guest, we need to inject them to L1 or L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (nr == PF_VECTOR) {
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
			return 1;
		}
	} else if (vmcs12->exception_bitmap & (1u << nr)) {
		if (nr == DB_VECTOR) {
			if (!has_payload) {
				payload = vcpu->arch.dr6;
				payload &= ~DR6_BT;
				payload ^= DR6_ACTIVE_LOW;
			}
			*exit_qual = payload;
		} else
			*exit_qual = 0;
		return 1;
	}

	return 0;
}


static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
	    !to_vmx(vcpu)->nested.nested_run_pending) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * For x2APIC MSRs, ignore the vmcs01 bitmap.  L1 can enable x2APIC without L1
 * itself utilizing x2APIC.  All MSRs were previously set to be intercepted,
 * only the "disable intercept" case needs to be handled.
 */
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
							unsigned long *msr_bitmap_l0,
							u32 msr, int type)
{
	if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);

	if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)					\
static inline									\
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,			\
					 unsigned long *msr_bitmap_l1,		\
					 unsigned long *msr_bitmap_l0, u32 msr)	\
{										\
	if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||		\
	    vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))			\
		vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
	else									\
		vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
}
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)

static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
						    unsigned long *msr_bitmap_l1,
						    unsigned long *msr_bitmap_l0,
						    u32 msr, int types)
{
	if (types & MSR_TYPE_R)
		nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
						  msr_bitmap_l0, msr);
	if (types & MSR_TYPE_W)
		nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
						   msr_bitmap_l0, msr);
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively toggle those relevant to L2.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_x2apic_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/*
	 * Always check vmcs01's bitmap to honor userspace MSR filters and any
	 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
	 */
#ifdef CONFIG_X86_64
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PRED_CMD, MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			      VMCS12_SIZE);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			       VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	   (CC(!nested_cpu_has_vid(vmcs12)) ||
	    CC(!nested_exit_intr_ack_set(vcpu)) ||
	    CC((vmcs12->posted_intr_nv & 0xff00)) ||
	    CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}
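
/*
 * Note (added for clarity): each VM-entry/VM-exit MSR-load/store list entry
 * is a 16-byte vmx_msr_entry (u32 index, u32 reserved, u64 value), which is
 * why nested_vmx_check_msr_switch() above requires 16-byte aligned addresses
 * and why the list capacities used below are counted in such entries.
 */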
static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware as
 * much as possible, process all valid entries before failing rather than
 * precheck for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC) {
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
						offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_slot;
	bool in_autostore_list;
	int last;

	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
	in_autostore_list = msr_autostore_slot >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here.  Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
			 * instead of reading the value from the vmcs02 VMExit
			 * MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore.  Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

/*
 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_PDPTE;
		return -EINVAL;
	}

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}
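
/*
 * Summary (added for clarity), derived from the checks below in
 * nested_vmx_transition_tlb_flush():
 *   - vmcs12 does not use VPID:        flush the guest TLB (all contexts)
 *   - vpid12 changed on VM-Enter:      flush the guest TLB (new guest ASID)
 *   - L2 shares L1's hardware TLB tag: flush only the current context
 */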
static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
	 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
	 * full TLB flush from the guest's perspective.  This is required even
	 * if VPID is disabled in the host as KVM may need to synchronize the
	 * MMU in response to the guest TLB flush.
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled.  As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * VPID is enabled and in use by vmcs12.  If vpid12 is changing, then
	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
	 * is the VPID incorporated into the MMU context.  I.e. KVM must assume
	 * that the new vpid12 has never been used and thus represents a new
	 * guest ASID that cannot have entries in the TLB.
	 */
	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
	 * KVM was unable to allocate a VPID for L2, flush the current context
	 * as the effective ASID is common to both L1 and L2.
	 */
	if (!nested_has_guest_tlb_tag(vcpu))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmx->nested.msrs.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}
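
/*
 * Note (added for clarity) on the layout assumed by vmx_restore_control_msr()
 * below: the VMX capability control MSRs follow the SDM Vol 3, Appendix A.3
 * format, where bits 31:0 report the allowed 0-settings (a bit set to 1 means
 * the corresponding control must be 1) and bits 63:32 report the allowed
 * 1-settings (a bit cleared to 0 means the control must be 0).
 */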
static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 supported;
	u32 *lowp, *highp;

	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		lowp = &vmx->nested.msrs.pinbased_ctls_low;
		highp = &vmx->nested.msrs.pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		lowp = &vmx->nested.msrs.procbased_ctls_low;
		highp = &vmx->nested.msrs.procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		lowp = &vmx->nested.msrs.exit_ctls_low;
		highp = &vmx->nested.msrs.exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		lowp = &vmx->nested.msrs.entry_ctls_low;
		highp = &vmx->nested.msrs.entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		lowp = &vmx->nested.msrs.secondary_ctls_low;
		highp = &vmx->nested.msrs.secondary_ctls_high;
		break;
	default:
		BUG();
	}

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc;

	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				   vmx->nested.msrs.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap;

	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
					   vmx->nested.msrs.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 *msr;

	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		msr = &vmx->nested.msrs.cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		msr = &vmx->nested.msrs.cr4_fixed0;
		break;
	default:
		BUG();
	}

	/*
	 * 1 bits (which indicate bits which "must-be-1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*msr = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmx->nested.msrs.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of fields
 * tagged SHADOW_FIELD_RO may or may not align with the "read-only" VM-exit
 * information fields (which are actually writable if the vCPU is configured
 * to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}
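
/*
 * Note (added for clarity): the function below copies fields from the
 * in-memory enlightened VMCS into the cached vmcs12.  hv_clean_fields is the
 * clean-fields bitmap advertised by the nested hypervisor; a set bit
 * indicates the corresponding group of fields has not changed since the last
 * VM-Enter and can therefore be skipped.
 */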
static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
	}

	/*
	 * Not used?
	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
	 * vmcs12->page_fault_error_code_mask =
	 *		evmcs->page_fault_error_code_mask;
	 * vmcs12->page_fault_error_code_match =
	 *		evmcs->page_fault_error_code_match;
	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
	 */

	/*
	 * Read only fields:
	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
	 * vmcs12->exit_qualification = evmcs->exit_qualification;
	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
	 *
	 * Not present in struct vmcs12:
	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return;
}
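
/*
 * Note (added for clarity): the function below is the mirror image of
 * copy_enlightened_to_vmcs12().  It propagates the vmcs12 fields that KVM may
 * have updated (guest state and VM-exit information) back to the in-memory
 * enlightened VMCS so that L1 observes them after the emulated VM-exit.
 */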
static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/*
	 * Should not be changed by KVM:
	 *
	 * evmcs->host_es_selector = vmcs12->host_es_selector;
	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
	 * evmcs->host_cr0 = vmcs12->host_cr0;
	 * evmcs->host_cr3 = vmcs12->host_cr3;
	 * evmcs->host_cr4 = vmcs12->host_cr4;
	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
	 * evmcs->host_rip = vmcs12->host_rip;
	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
	 * evmcs->host_fs_base = vmcs12->host_fs_base;
	 * evmcs->host_gs_base = vmcs12->host_gs_base;
	 * evmcs->host_tr_base = vmcs12->host_tr_base;
	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
	 * evmcs->host_rsp = vmcs12->host_rsp;
	 * sync_vmcs02_to_vmcs12() doesn't read these:
	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
	 * evmcs->ept_pointer = vmcs12->ept_pointer;
	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
	 * evmcs->page_fault_error_code_mask =
	 *		vmcs12->page_fault_error_code_mask;
	 * evmcs->page_fault_error_code_match =
	 *		vmcs12->page_fault_error_code_match;
	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
	 * evmcs->tsc_offset = vmcs12->tsc_offset;
	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
	 *
	 * Not present in struct vmcs12:
	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
	 */
evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1818 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1819 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1820 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1821 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1822 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1823 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1824 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1825 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1826 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1827 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1828 * evmcs->page_fault_error_code_mask = 1829 * vmcs12->page_fault_error_code_mask; 1830 * evmcs->page_fault_error_code_match = 1831 * vmcs12->page_fault_error_code_match; 1832 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1833 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1834 * evmcs->tsc_offset = vmcs12->tsc_offset; 1835 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1836 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1837 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1838 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1839 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1840 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1841 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1842 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1843 * 1844 * Not present in struct vmcs12: 1845 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1846 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1847 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1848 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1849 */ 1850 1851 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1852 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1853 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1854 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1855 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1856 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1857 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1858 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1859 1860 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1861 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1862 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1863 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1864 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1865 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1866 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1867 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1868 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1869 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1870 1871 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1872 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1873 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1874 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1875 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1876 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1877 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1878 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1879 1880 evmcs->guest_es_base = vmcs12->guest_es_base; 1881 evmcs->guest_cs_base = vmcs12->guest_cs_base; 
1882 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1883 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1884 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1885 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1886 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1887 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1888 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1889 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1890 1891 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1892 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1893 1894 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1895 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1896 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1897 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1898 1899 evmcs->guest_pending_dbg_exceptions = 1900 vmcs12->guest_pending_dbg_exceptions; 1901 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1902 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1903 1904 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1905 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1906 1907 evmcs->guest_cr0 = vmcs12->guest_cr0; 1908 evmcs->guest_cr3 = vmcs12->guest_cr3; 1909 evmcs->guest_cr4 = vmcs12->guest_cr4; 1910 evmcs->guest_dr7 = vmcs12->guest_dr7; 1911 1912 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1913 1914 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1915 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1916 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1917 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1918 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1919 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1920 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1921 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1922 1923 evmcs->exit_qualification = vmcs12->exit_qualification; 1924 1925 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1926 evmcs->guest_rsp = vmcs12->guest_rsp; 1927 evmcs->guest_rflags = vmcs12->guest_rflags; 1928 1929 evmcs->guest_interruptibility_info = 1930 vmcs12->guest_interruptibility_info; 1931 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1932 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1933 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1934 evmcs->vm_entry_exception_error_code = 1935 vmcs12->vm_entry_exception_error_code; 1936 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 1937 1938 evmcs->guest_rip = vmcs12->guest_rip; 1939 1940 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 1941 1942 return; 1943 } 1944 1945 /* 1946 * This is an equivalent of the nested hypervisor executing the vmptrld 1947 * instruction. 
1948 */ 1949 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( 1950 struct kvm_vcpu *vcpu, bool from_launch) 1951 { 1952 struct vcpu_vmx *vmx = to_vmx(vcpu); 1953 bool evmcs_gpa_changed = false; 1954 u64 evmcs_gpa; 1955 1956 if (likely(!vmx->nested.enlightened_vmcs_enabled)) 1957 return EVMPTRLD_DISABLED; 1958 1959 if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) { 1960 nested_release_evmcs(vcpu); 1961 return EVMPTRLD_DISABLED; 1962 } 1963 1964 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { 1965 vmx->nested.current_vmptr = INVALID_GPA; 1966 1967 nested_release_evmcs(vcpu); 1968 1969 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), 1970 &vmx->nested.hv_evmcs_map)) 1971 return EVMPTRLD_ERROR; 1972 1973 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; 1974 1975 /* 1976 * Currently, KVM only supports eVMCS version 1 1977 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this 1978 * value to first u32 field of eVMCS which should specify eVMCS 1979 * VersionNumber. 1980 * 1981 * Guest should be aware of supported eVMCS versions by host by 1982 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is 1983 * expected to set this CPUID leaf according to the value 1984 * returned in vmcs_version from nested_enable_evmcs(). 1985 * 1986 * However, it turns out that Microsoft Hyper-V fails to comply 1987 * to their own invented interface: When Hyper-V use eVMCS, it 1988 * just sets first u32 field of eVMCS to revision_id specified 1989 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number 1990 * which is one of the supported versions specified in 1991 * CPUID.0x4000000A.EAX[0:15]. 1992 * 1993 * To overcome Hyper-V bug, we accept here either a supported 1994 * eVMCS version or VMCS12 revision_id as valid values for first 1995 * u32 field of eVMCS. 1996 */ 1997 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && 1998 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { 1999 nested_release_evmcs(vcpu); 2000 return EVMPTRLD_VMFAIL; 2001 } 2002 2003 vmx->nested.hv_evmcs_vmptr = evmcs_gpa; 2004 2005 evmcs_gpa_changed = true; 2006 /* 2007 * Unlike normal vmcs12, enlightened vmcs12 is not fully 2008 * reloaded from guest's memory (read only fields, fields not 2009 * present in struct hv_enlightened_vmcs, ...). Make sure there 2010 * are no leftovers. 2011 */ 2012 if (from_launch) { 2013 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2014 memset(vmcs12, 0, sizeof(*vmcs12)); 2015 vmcs12->hdr.revision_id = VMCS12_REVISION; 2016 } 2017 2018 } 2019 2020 /* 2021 * Clean fields data can't be used on VMLAUNCH and when we switch 2022 * between different L2 guests as KVM keeps a single VMCS12 per L1. 
2023 */ 2024 if (from_launch || evmcs_gpa_changed) 2025 vmx->nested.hv_evmcs->hv_clean_fields &= 2026 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2027 2028 return EVMPTRLD_SUCCEEDED; 2029 } 2030 2031 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2032 { 2033 struct vcpu_vmx *vmx = to_vmx(vcpu); 2034 2035 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2036 copy_vmcs12_to_enlightened(vmx); 2037 else 2038 copy_vmcs12_to_shadow(vmx); 2039 2040 vmx->nested.need_vmcs12_to_shadow_sync = false; 2041 } 2042 2043 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2044 { 2045 struct vcpu_vmx *vmx = 2046 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2047 2048 vmx->nested.preemption_timer_expired = true; 2049 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2050 kvm_vcpu_kick(&vmx->vcpu); 2051 2052 return HRTIMER_NORESTART; 2053 } 2054 2055 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2056 { 2057 struct vcpu_vmx *vmx = to_vmx(vcpu); 2058 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2059 2060 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2061 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2062 2063 if (!vmx->nested.has_preemption_timer_deadline) { 2064 vmx->nested.preemption_timer_deadline = 2065 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2066 vmx->nested.has_preemption_timer_deadline = true; 2067 } 2068 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2069 } 2070 2071 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2072 u64 preemption_timeout) 2073 { 2074 struct vcpu_vmx *vmx = to_vmx(vcpu); 2075 2076 /* 2077 * A timer value of zero is architecturally guaranteed to cause 2078 * a VMExit prior to executing any instructions in the guest. 2079 */ 2080 if (preemption_timeout == 0) { 2081 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2082 return; 2083 } 2084 2085 if (vcpu->arch.virtual_tsc_khz == 0) 2086 return; 2087 2088 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2089 preemption_timeout *= 1000000; 2090 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2091 hrtimer_start(&vmx->nested.preemption_timer, 2092 ktime_add_ns(ktime_get(), preemption_timeout), 2093 HRTIMER_MODE_ABS_PINNED); 2094 } 2095 2096 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2097 { 2098 if (vmx->nested.nested_run_pending && 2099 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2100 return vmcs12->guest_ia32_efer; 2101 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2102 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2103 else 2104 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2105 } 2106 2107 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2108 { 2109 /* 2110 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2111 * according to L0's settings (vmcs12 is irrelevant here). Host 2112 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2113 * will be set as needed prior to VMLAUNCH/VMRESUME. 2114 */ 2115 if (vmx->nested.vmcs02_initialized) 2116 return; 2117 vmx->nested.vmcs02_initialized = true; 2118 2119 /* 2120 * We don't care what the EPTP value is we just need to guarantee 2121 * it's valid so we don't get a false positive when doing early 2122 * consistency checks. 2123 */ 2124 if (enable_ept && nested_early_check) 2125 vmcs_write64(EPT_POINTER, 2126 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2127 2128 /* All VMFUNCs are currently emulated through L0 vmexits. 
*/ 2129 if (cpu_has_vmx_vmfunc()) 2130 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2131 2132 if (cpu_has_vmx_posted_intr()) 2133 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2134 2135 if (cpu_has_vmx_msr_bitmap()) 2136 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2137 2138 /* 2139 * PML is emulated for L2, but never enabled in hardware as the MMU 2140 * handles A/D emulation. Disabling PML for L2 also avoids having to 2141 * deal with filtering out L2 GPAs from the buffer. 2142 */ 2143 if (enable_pml) { 2144 vmcs_write64(PML_ADDRESS, 0); 2145 vmcs_write16(GUEST_PML_INDEX, -1); 2146 } 2147 2148 if (cpu_has_vmx_encls_vmexit()) 2149 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2150 2151 /* 2152 * Set the MSR load/store lists to match L0's settings. Only the 2153 * addresses are constant (for vmcs02), the counts can change based 2154 * on L2's behavior, e.g. switching to/from long mode. 2155 */ 2156 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2157 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2158 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2159 2160 vmx_set_constant_host_state(vmx); 2161 } 2162 2163 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2164 struct vmcs12 *vmcs12) 2165 { 2166 prepare_vmcs02_constant_state(vmx); 2167 2168 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2169 2170 if (enable_vpid) { 2171 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2172 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2173 else 2174 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2175 } 2176 } 2177 2178 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2179 struct vmcs12 *vmcs12) 2180 { 2181 u32 exec_control; 2182 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2183 2184 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2185 prepare_vmcs02_early_rare(vmx, vmcs12); 2186 2187 /* 2188 * PIN CONTROLS 2189 */ 2190 exec_control = __pin_controls_get(vmcs01); 2191 exec_control |= (vmcs12->pin_based_vm_exec_control & 2192 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2193 2194 /* Posted interrupts setting is only taken from vmcs12. */ 2195 vmx->nested.pi_pending = false; 2196 if (nested_cpu_has_posted_intr(vmcs12)) 2197 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2198 else 2199 exec_control &= ~PIN_BASED_POSTED_INTR; 2200 pin_controls_set(vmx, exec_control); 2201 2202 /* 2203 * EXEC CONTROLS 2204 */ 2205 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2206 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2207 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2208 exec_control &= ~CPU_BASED_TPR_SHADOW; 2209 exec_control |= vmcs12->cpu_based_vm_exec_control; 2210 2211 vmx->nested.l1_tpr_threshold = -1; 2212 if (exec_control & CPU_BASED_TPR_SHADOW) 2213 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2214 #ifdef CONFIG_X86_64 2215 else 2216 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2217 CPU_BASED_CR8_STORE_EXITING; 2218 #endif 2219 2220 /* 2221 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2222 * for I/O port accesses. 2223 */ 2224 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2225 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2226 2227 /* 2228 * This bit will be computed in nested_get_vmcs12_pages, because 2229 * we do not have access to L1's MSR bitmap yet. For now, keep 2230 * the same bit as before, hoping to avoid multiple VMWRITEs that 2231 * only set/clear this bit. 
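	 * The final value is set by nested_get_vmcs12_pages() once L1's MSR
	 * bitmap has been mapped and merged by nested_vmx_prepare_msr_bitmap().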
2232 */ 2233 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2234 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2235 2236 exec_controls_set(vmx, exec_control); 2237 2238 /* 2239 * SECONDARY EXEC CONTROLS 2240 */ 2241 if (cpu_has_secondary_exec_ctrls()) { 2242 exec_control = __secondary_exec_controls_get(vmcs01); 2243 2244 /* Take the following fields only from vmcs12 */ 2245 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2246 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2247 SECONDARY_EXEC_ENABLE_INVPCID | 2248 SECONDARY_EXEC_ENABLE_RDTSCP | 2249 SECONDARY_EXEC_XSAVES | 2250 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2251 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2252 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2253 SECONDARY_EXEC_ENABLE_VMFUNC | 2254 SECONDARY_EXEC_TSC_SCALING | 2255 SECONDARY_EXEC_DESC); 2256 2257 if (nested_cpu_has(vmcs12, 2258 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2259 exec_control |= vmcs12->secondary_vm_exec_control; 2260 2261 /* PML is emulated and never enabled in hardware for L2. */ 2262 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2263 2264 /* VMCS shadowing for L2 is emulated for now */ 2265 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2266 2267 /* 2268 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2269 * will not have to rewrite the controls just for this bit. 2270 */ 2271 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2272 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2273 exec_control |= SECONDARY_EXEC_DESC; 2274 2275 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2276 vmcs_write16(GUEST_INTR_STATUS, 2277 vmcs12->guest_intr_status); 2278 2279 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2280 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2281 2282 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2283 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2284 2285 secondary_exec_controls_set(vmx, exec_control); 2286 } 2287 2288 /* 2289 * ENTRY CONTROLS 2290 * 2291 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2292 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2293 * on the related bits (if supported by the CPU) in the hope that 2294 * we can avoid VMWrites during vmx_set_efer(). 2295 */ 2296 exec_control = __vm_entry_controls_get(vmcs01); 2297 exec_control |= vmcs12->vm_entry_controls; 2298 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2299 if (cpu_has_load_ia32_efer()) { 2300 if (guest_efer & EFER_LMA) 2301 exec_control |= VM_ENTRY_IA32E_MODE; 2302 if (guest_efer != host_efer) 2303 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2304 } 2305 vm_entry_controls_set(vmx, exec_control); 2306 2307 /* 2308 * EXIT CONTROLS 2309 * 2310 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2311 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2312 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
2313 */ 2314 exec_control = __vm_exit_controls_get(vmcs01); 2315 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2316 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2317 else 2318 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2319 vm_exit_controls_set(vmx, exec_control); 2320 2321 /* 2322 * Interrupt/Exception Fields 2323 */ 2324 if (vmx->nested.nested_run_pending) { 2325 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2326 vmcs12->vm_entry_intr_info_field); 2327 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2328 vmcs12->vm_entry_exception_error_code); 2329 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2330 vmcs12->vm_entry_instruction_len); 2331 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2332 vmcs12->guest_interruptibility_info); 2333 vmx->loaded_vmcs->nmi_known_unmasked = 2334 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2335 } else { 2336 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2337 } 2338 } 2339 2340 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2341 { 2342 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2343 2344 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2345 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2346 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2347 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2348 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2349 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2350 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2351 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2352 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2353 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2354 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2355 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2356 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2357 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2358 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2359 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2360 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2361 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2362 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2363 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2364 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2365 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2366 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2367 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2368 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2369 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2370 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2371 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2372 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2373 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2374 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2375 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2376 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2377 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2378 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2379 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2380 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2381 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2382 2383 vmx->segment_cache.bitmask = 0; 2384 } 2385 2386 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2387 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 
		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
			    vmcs12->guest_pending_dbg_exceptions);
		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

		/*
		 * L1 may access L2's PDPTRs, so save them in order to
		 * construct vmcs12.
		 */
		if (enable_ept) {
			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
		}

		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
	}

	if (nested_cpu_has_xsaves(vmcs12))
		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);

	/*
	 * Whether page-faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
	 * doesn't care about page faults then we should set all of these to
	 * L1's desires. However, if L0 does care about (some) page faults,
	 * it is not easy (if at all possible?) to merge L0's and L1's
	 * desires, so we simply ask to exit on each and every L2 page fault.
	 * This is done by setting MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 */
	if (vmx_need_pf_intercept(&vmx->vcpu)) {
		/*
		 * TODO: if both L0 and L1 need the same MASK and MATCH,
		 * go ahead and use it?
		 */
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	} else {
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
	}

	if (cpu_has_vmx_apicv()) {
		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
	}

	/*
	 * Make sure the msr_autostore list is up to date before we set the
	 * count in the vmcs02.
	 */
	prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);

	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	set_cr4_guest_host_mask(vmx);
}

/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 * guest in a way that will both be appropriate to L1's requests, and our
 * needs. In addition to modifying the active vmcs (which is vmcs02), this
 * function also has other necessary side effects, like setting various
 * vcpu->arch fields.
 * Returns 0 on success and -EINVAL on failure; on failure, the invalid-state
 * exit qualification code is assigned to *entry_failure_code.
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry,
			  enum vm_entry_failure_code *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool load_guest_pdptrs_vmcs12 = false;

	if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
		prepare_vmcs02_rare(vmx, vmcs12);
		vmx->nested.dirty_vmcs12 = false;

		load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
			!(vmx->nested.hv_evmcs->hv_clean_fields &
			  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
	}

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
	}
	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);

	/*
	 * EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	vmx_update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
	}

	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
			vcpu->arch.l1_tsc_offset,
			vmx_get_l2_tsc_offset(vcpu),
			vmx_get_l2_tsc_multiplier(vcpu));

	vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
			vcpu->arch.l1_tsc_scaling_ratio,
			vmx_get_l2_tsc_multiplier(vcpu));

	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (kvm_has_tsc_control)
		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);

	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);

	/*
	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
	 * bits that we consider mandatory to be enabled.
	 * The CR0_READ_SHADOW is what L2 should have expected to read given
	 * the specifications by L1; it's not enough to take
	 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask can have
	 * more bits set than L1 expected.
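	 * nested_read_cr0() reconstructs the value L2 expects: bits covered by
	 * cr0_guest_host_mask are taken from vmcs12->cr0_read_shadow, the
	 * remaining bits from vmcs12->guest_cr0.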
2539 */ 2540 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2541 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2542 2543 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2544 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2545 2546 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2547 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2548 vmx_set_efer(vcpu, vcpu->arch.efer); 2549 2550 /* 2551 * Guest state is invalid and unrestricted guest is disabled, 2552 * which means L1 attempted VMEntry to L2 with invalid state. 2553 * Fail the VMEntry. 2554 * 2555 * However when force loading the guest state (SMM exit or 2556 * loading nested state after migration, it is possible to 2557 * have invalid guest state now, which will be later fixed by 2558 * restoring L2 register state 2559 */ 2560 if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { 2561 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2562 return -EINVAL; 2563 } 2564 2565 /* Shadow page tables on either EPT or shadow page tables. */ 2566 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2567 from_vmentry, entry_failure_code)) 2568 return -EINVAL; 2569 2570 /* 2571 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 2572 * on nested VM-Exit, which can occur without actually running L2 and 2573 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with 2574 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the 2575 * transition to HLT instead of running L2. 2576 */ 2577 if (enable_ept) 2578 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2579 2580 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2581 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2582 is_pae_paging(vcpu)) { 2583 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2584 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2585 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2586 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2587 } 2588 2589 if (!enable_ept) 2590 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2591 2592 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2593 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2594 vmcs12->guest_ia32_perf_global_ctrl))) 2595 return -EINVAL; 2596 2597 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2598 kvm_rip_write(vcpu, vmcs12->guest_rip); 2599 2600 /* 2601 * It was observed that genuine Hyper-V running in L1 doesn't reset 2602 * 'hv_clean_fields' by itself, it only sets the corresponding dirty 2603 * bits when it changes a field in eVMCS. Mark all fields as clean 2604 * here. 
2605 */ 2606 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2607 vmx->nested.hv_evmcs->hv_clean_fields |= 2608 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2609 2610 return 0; 2611 } 2612 2613 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2614 { 2615 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2616 nested_cpu_has_virtual_nmis(vmcs12))) 2617 return -EINVAL; 2618 2619 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2620 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2621 return -EINVAL; 2622 2623 return 0; 2624 } 2625 2626 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2627 { 2628 struct vcpu_vmx *vmx = to_vmx(vcpu); 2629 2630 /* Check for memory type validity */ 2631 switch (new_eptp & VMX_EPTP_MT_MASK) { 2632 case VMX_EPTP_MT_UC: 2633 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2634 return false; 2635 break; 2636 case VMX_EPTP_MT_WB: 2637 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2638 return false; 2639 break; 2640 default: 2641 return false; 2642 } 2643 2644 /* Page-walk levels validity. */ 2645 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2646 case VMX_EPTP_PWL_5: 2647 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2648 return false; 2649 break; 2650 case VMX_EPTP_PWL_4: 2651 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2652 return false; 2653 break; 2654 default: 2655 return false; 2656 } 2657 2658 /* Reserved bits should not be set */ 2659 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2660 return false; 2661 2662 /* AD, if set, should be supported */ 2663 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2664 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2665 return false; 2666 } 2667 2668 return true; 2669 } 2670 2671 /* 2672 * Checks related to VM-Execution Control Fields 2673 */ 2674 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2675 struct vmcs12 *vmcs12) 2676 { 2677 struct vcpu_vmx *vmx = to_vmx(vcpu); 2678 2679 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2680 vmx->nested.msrs.pinbased_ctls_low, 2681 vmx->nested.msrs.pinbased_ctls_high)) || 2682 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2683 vmx->nested.msrs.procbased_ctls_low, 2684 vmx->nested.msrs.procbased_ctls_high))) 2685 return -EINVAL; 2686 2687 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2688 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2689 vmx->nested.msrs.secondary_ctls_low, 2690 vmx->nested.msrs.secondary_ctls_high))) 2691 return -EINVAL; 2692 2693 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2694 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2695 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2696 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2697 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2698 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2699 nested_vmx_check_nmi_controls(vmcs12) || 2700 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2701 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2702 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2703 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2704 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2705 return -EINVAL; 2706 2707 if (!nested_cpu_has_preemption_timer(vmcs12) && 2708 nested_cpu_has_save_preemption_timer(vmcs12)) 2709 return -EINVAL; 2710 2711 if (nested_cpu_has_ept(vmcs12) && 2712 CC(!nested_vmx_check_eptp(vcpu, 
vmcs12->ept_pointer))) 2713 return -EINVAL; 2714 2715 if (nested_cpu_has_vmfunc(vmcs12)) { 2716 if (CC(vmcs12->vm_function_control & 2717 ~vmx->nested.msrs.vmfunc_controls)) 2718 return -EINVAL; 2719 2720 if (nested_cpu_has_eptp_switching(vmcs12)) { 2721 if (CC(!nested_cpu_has_ept(vmcs12)) || 2722 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2723 return -EINVAL; 2724 } 2725 } 2726 2727 return 0; 2728 } 2729 2730 /* 2731 * Checks related to VM-Exit Control Fields 2732 */ 2733 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2734 struct vmcs12 *vmcs12) 2735 { 2736 struct vcpu_vmx *vmx = to_vmx(vcpu); 2737 2738 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2739 vmx->nested.msrs.exit_ctls_low, 2740 vmx->nested.msrs.exit_ctls_high)) || 2741 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2742 return -EINVAL; 2743 2744 return 0; 2745 } 2746 2747 /* 2748 * Checks related to VM-Entry Control Fields 2749 */ 2750 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2751 struct vmcs12 *vmcs12) 2752 { 2753 struct vcpu_vmx *vmx = to_vmx(vcpu); 2754 2755 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2756 vmx->nested.msrs.entry_ctls_low, 2757 vmx->nested.msrs.entry_ctls_high))) 2758 return -EINVAL; 2759 2760 /* 2761 * From the Intel SDM, volume 3: 2762 * Fields relevant to VM-entry event injection must be set properly. 2763 * These fields are the VM-entry interruption-information field, the 2764 * VM-entry exception error code, and the VM-entry instruction length. 2765 */ 2766 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2767 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2768 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2769 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2770 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2771 bool should_have_error_code; 2772 bool urg = nested_cpu_has2(vmcs12, 2773 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2774 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2775 2776 /* VM-entry interruption-info field: interruption type */ 2777 if (CC(intr_type == INTR_TYPE_RESERVED) || 2778 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2779 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2780 return -EINVAL; 2781 2782 /* VM-entry interruption-info field: vector */ 2783 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2784 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2785 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2786 return -EINVAL; 2787 2788 /* VM-entry interruption-info field: deliver error code */ 2789 should_have_error_code = 2790 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2791 x86_exception_has_error_code(vector); 2792 if (CC(has_error_code != should_have_error_code)) 2793 return -EINVAL; 2794 2795 /* VM-entry exception error code */ 2796 if (CC(has_error_code && 2797 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2798 return -EINVAL; 2799 2800 /* VM-entry interruption-info field: reserved bits */ 2801 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2802 return -EINVAL; 2803 2804 /* VM-entry instruction length */ 2805 switch (intr_type) { 2806 case INTR_TYPE_SOFT_EXCEPTION: 2807 case INTR_TYPE_SOFT_INTR: 2808 case INTR_TYPE_PRIV_SW_EXCEPTION: 2809 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2810 CC(vmcs12->vm_entry_instruction_len == 0 && 2811 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2812 return -EINVAL; 2813 } 2814 } 2815 2816 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 
2817 return -EINVAL; 2818 2819 return 0; 2820 } 2821 2822 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2823 struct vmcs12 *vmcs12) 2824 { 2825 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2826 nested_check_vm_exit_controls(vcpu, vmcs12) || 2827 nested_check_vm_entry_controls(vcpu, vmcs12)) 2828 return -EINVAL; 2829 2830 if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled) 2831 return nested_evmcs_check_controls(vmcs12); 2832 2833 return 0; 2834 } 2835 2836 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 2837 struct vmcs12 *vmcs12) 2838 { 2839 #ifdef CONFIG_X86_64 2840 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 2841 !!(vcpu->arch.efer & EFER_LMA))) 2842 return -EINVAL; 2843 #endif 2844 return 0; 2845 } 2846 2847 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2848 struct vmcs12 *vmcs12) 2849 { 2850 bool ia32e; 2851 2852 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2853 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2854 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) 2855 return -EINVAL; 2856 2857 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2858 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2859 return -EINVAL; 2860 2861 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2862 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2863 return -EINVAL; 2864 2865 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2866 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2867 vmcs12->host_ia32_perf_global_ctrl))) 2868 return -EINVAL; 2869 2870 #ifdef CONFIG_X86_64 2871 ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 2872 #else 2873 ia32e = false; 2874 #endif 2875 2876 if (ia32e) { 2877 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2878 return -EINVAL; 2879 } else { 2880 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2881 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2882 CC((vmcs12->host_rip) >> 32)) 2883 return -EINVAL; 2884 } 2885 2886 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2887 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2888 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2889 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2890 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2891 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2892 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2893 CC(vmcs12->host_cs_selector == 0) || 2894 CC(vmcs12->host_tr_selector == 0) || 2895 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2896 return -EINVAL; 2897 2898 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2899 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2900 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2901 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2902 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2903 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2904 return -EINVAL; 2905 2906 /* 2907 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2908 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2909 * the values of the LMA and LME bits in the field must each be that of 2910 * the host address-space size VM-exit control. 
2911 */ 2912 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2913 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2914 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2915 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2916 return -EINVAL; 2917 } 2918 2919 return 0; 2920 } 2921 2922 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2923 struct vmcs12 *vmcs12) 2924 { 2925 struct vcpu_vmx *vmx = to_vmx(vcpu); 2926 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 2927 struct vmcs_hdr hdr; 2928 2929 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 2930 return 0; 2931 2932 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2933 return -EINVAL; 2934 2935 if (ghc->gpa != vmcs12->vmcs_link_pointer && 2936 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 2937 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 2938 return -EINVAL; 2939 2940 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 2941 offsetof(struct vmcs12, hdr), 2942 sizeof(hdr)))) 2943 return -EINVAL; 2944 2945 if (CC(hdr.revision_id != VMCS12_REVISION) || 2946 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2947 return -EINVAL; 2948 2949 return 0; 2950 } 2951 2952 /* 2953 * Checks related to Guest Non-register State 2954 */ 2955 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2956 { 2957 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2958 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 2959 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 2960 return -EINVAL; 2961 2962 return 0; 2963 } 2964 2965 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 2966 struct vmcs12 *vmcs12, 2967 enum vm_entry_failure_code *entry_failure_code) 2968 { 2969 bool ia32e; 2970 2971 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2972 2973 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 2974 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 2975 return -EINVAL; 2976 2977 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 2978 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 2979 return -EINVAL; 2980 2981 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 2982 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 2983 return -EINVAL; 2984 2985 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 2986 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 2987 return -EINVAL; 2988 } 2989 2990 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2991 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2992 vmcs12->guest_ia32_perf_global_ctrl))) 2993 return -EINVAL; 2994 2995 /* 2996 * If the load IA32_EFER VM-entry control is 1, the following checks 2997 * are performed on the field for the IA32_EFER MSR: 2998 * - Bits reserved in the IA32_EFER MSR must be 0. 2999 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3000 * the IA-32e mode guest VM-exit control. It must also be identical 3001 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3002 * CR0.PG) is 1. 
3003 */ 3004 if (to_vmx(vcpu)->nested.nested_run_pending && 3005 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3006 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 3007 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3008 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3009 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3010 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3011 return -EINVAL; 3012 } 3013 3014 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3015 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3016 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3017 return -EINVAL; 3018 3019 if (nested_check_guest_non_reg_state(vmcs12)) 3020 return -EINVAL; 3021 3022 return 0; 3023 } 3024 3025 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3026 { 3027 struct vcpu_vmx *vmx = to_vmx(vcpu); 3028 unsigned long cr3, cr4; 3029 bool vm_fail; 3030 3031 if (!nested_early_check) 3032 return 0; 3033 3034 if (vmx->msr_autoload.host.nr) 3035 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3036 if (vmx->msr_autoload.guest.nr) 3037 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3038 3039 preempt_disable(); 3040 3041 vmx_prepare_switch_to_guest(vcpu); 3042 3043 /* 3044 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3045 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3046 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3047 * there is no need to preserve other bits or save/restore the field. 3048 */ 3049 vmcs_writel(GUEST_RFLAGS, 0); 3050 3051 cr3 = __get_current_cr3_fast(); 3052 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3053 vmcs_writel(HOST_CR3, cr3); 3054 vmx->loaded_vmcs->host_state.cr3 = cr3; 3055 } 3056 3057 cr4 = cr4_read_shadow(); 3058 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3059 vmcs_writel(HOST_CR4, cr4); 3060 vmx->loaded_vmcs->host_state.cr4 = cr4; 3061 } 3062 3063 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3064 vmx->loaded_vmcs->launched); 3065 3066 if (vmx->msr_autoload.host.nr) 3067 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3068 if (vmx->msr_autoload.guest.nr) 3069 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3070 3071 if (vm_fail) { 3072 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3073 3074 preempt_enable(); 3075 3076 trace_kvm_nested_vmenter_failed( 3077 "early hardware check VM-instruction error: ", error); 3078 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3079 return 1; 3080 } 3081 3082 /* 3083 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3084 */ 3085 if (hw_breakpoint_active()) 3086 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3087 local_irq_enable(); 3088 preempt_enable(); 3089 3090 /* 3091 * A non-failing VMEntry means we somehow entered guest mode with 3092 * an illegal RIP, and that's just the tip of the iceberg. There 3093 * is no telling what memory has been modified or what state has 3094 * been exposed to unknown code. Hitting this all but guarantees 3095 * a (very critical) hardware issue. 3096 */ 3097 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3098 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3099 3100 return 0; 3101 } 3102 3103 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3104 { 3105 struct vcpu_vmx *vmx = to_vmx(vcpu); 3106 3107 /* 3108 * hv_evmcs may end up being not mapped after migration (when 3109 * L2 was running), map it here to make sure vmcs12 changes are 3110 * properly reflected. 
3111 */ 3112 if (vmx->nested.enlightened_vmcs_enabled && 3113 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3114 enum nested_evmptrld_status evmptrld_status = 3115 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3116 3117 if (evmptrld_status == EVMPTRLD_VMFAIL || 3118 evmptrld_status == EVMPTRLD_ERROR) 3119 return false; 3120 3121 /* 3122 * Post migration VMCS12 always provides the most actual 3123 * information, copy it to eVMCS upon entry. 3124 */ 3125 vmx->nested.need_vmcs12_to_shadow_sync = true; 3126 } 3127 3128 return true; 3129 } 3130 3131 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3132 { 3133 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3134 struct vcpu_vmx *vmx = to_vmx(vcpu); 3135 struct kvm_host_map *map; 3136 struct page *page; 3137 u64 hpa; 3138 3139 if (!vcpu->arch.pdptrs_from_userspace && 3140 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3141 /* 3142 * Reload the guest's PDPTRs since after a migration 3143 * the guest CR3 might be restored prior to setting the nested 3144 * state which can lead to a load of wrong PDPTRs. 3145 */ 3146 if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) 3147 return false; 3148 } 3149 3150 3151 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3152 /* 3153 * Translate L1 physical address to host physical 3154 * address for vmcs02. Keep the page pinned, so this 3155 * physical address remains valid. We keep a reference 3156 * to it so we can release it later. 3157 */ 3158 if (vmx->nested.apic_access_page) { /* shouldn't happen */ 3159 kvm_release_page_clean(vmx->nested.apic_access_page); 3160 vmx->nested.apic_access_page = NULL; 3161 } 3162 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); 3163 if (!is_error_page(page)) { 3164 vmx->nested.apic_access_page = page; 3165 hpa = page_to_phys(vmx->nested.apic_access_page); 3166 vmcs_write64(APIC_ACCESS_ADDR, hpa); 3167 } else { 3168 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", 3169 __func__); 3170 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3171 vcpu->run->internal.suberror = 3172 KVM_INTERNAL_ERROR_EMULATION; 3173 vcpu->run->internal.ndata = 0; 3174 return false; 3175 } 3176 } 3177 3178 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3179 map = &vmx->nested.virtual_apic_map; 3180 3181 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3182 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3183 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3184 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3185 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3186 /* 3187 * The processor will never use the TPR shadow, simply 3188 * clear the bit from the execution control. Such a 3189 * configuration is useless, but it happens in tests. 3190 * For any other configuration, failing the vm entry is 3191 * _not_ what the processor does but it's basically the 3192 * only possibility we have. 3193 */ 3194 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3195 } else { 3196 /* 3197 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3198 * force VM-Entry to fail. 
3199 */ 3200 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3201 } 3202 } 3203 3204 if (nested_cpu_has_posted_intr(vmcs12)) { 3205 map = &vmx->nested.pi_desc_map; 3206 3207 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3208 vmx->nested.pi_desc = 3209 (struct pi_desc *)(((void *)map->hva) + 3210 offset_in_page(vmcs12->posted_intr_desc_addr)); 3211 vmcs_write64(POSTED_INTR_DESC_ADDR, 3212 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3213 } else { 3214 /* 3215 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3216 * access the contents of the VMCS12 posted interrupt 3217 * descriptor. (Note that KVM may do this when it 3218 * should not, per the architectural specification.) 3219 */ 3220 vmx->nested.pi_desc = NULL; 3221 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3222 } 3223 } 3224 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3225 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3226 else 3227 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3228 3229 return true; 3230 } 3231 3232 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3233 { 3234 if (!nested_get_evmcs_page(vcpu)) { 3235 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3236 __func__); 3237 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3238 vcpu->run->internal.suberror = 3239 KVM_INTERNAL_ERROR_EMULATION; 3240 vcpu->run->internal.ndata = 0; 3241 3242 return false; 3243 } 3244 3245 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3246 return false; 3247 3248 return true; 3249 } 3250 3251 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3252 { 3253 struct vmcs12 *vmcs12; 3254 struct vcpu_vmx *vmx = to_vmx(vcpu); 3255 gpa_t dst; 3256 3257 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3258 return 0; 3259 3260 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3261 return 1; 3262 3263 /* 3264 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3265 * set is already checked as part of A/D emulation. 3266 */ 3267 vmcs12 = get_vmcs12(vcpu); 3268 if (!nested_cpu_has_pml(vmcs12)) 3269 return 0; 3270 3271 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3272 vmx->nested.pml_full = true; 3273 return 1; 3274 } 3275 3276 gpa &= ~0xFFFull; 3277 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3278 3279 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3280 offset_in_page(dst), sizeof(gpa))) 3281 return 0; 3282 3283 vmcs12->guest_pml_index--; 3284 3285 return 0; 3286 } 3287 3288 /* 3289 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3290 * for running VMX instructions (except VMXON, whose prerequisites are 3291 * slightly different). It also specifies what exception to inject otherwise. 3292 * Note that many of these exceptions have priority over VM exits, so they 3293 * don't have to be checked again here. 
3294 */ 3295 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3296 { 3297 if (!to_vmx(vcpu)->nested.vmxon) { 3298 kvm_queue_exception(vcpu, UD_VECTOR); 3299 return 0; 3300 } 3301 3302 if (vmx_get_cpl(vcpu)) { 3303 kvm_inject_gp(vcpu, 0); 3304 return 0; 3305 } 3306 3307 return 1; 3308 } 3309 3310 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3311 { 3312 u8 rvi = vmx_get_rvi(); 3313 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3314 3315 return ((rvi & 0xf0) > (vppr & 0xf0)); 3316 } 3317 3318 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3319 struct vmcs12 *vmcs12); 3320 3321 /* 3322 * If from_vmentry is false, this is being called from state restore (either RSM 3323 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 3324 * 3325 * Returns: 3326 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3327 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3328 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3329 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3330 */ 3331 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3332 bool from_vmentry) 3333 { 3334 struct vcpu_vmx *vmx = to_vmx(vcpu); 3335 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3336 enum vm_entry_failure_code entry_failure_code; 3337 bool evaluate_pending_interrupts; 3338 union vmx_exit_reason exit_reason = { 3339 .basic = EXIT_REASON_INVALID_STATE, 3340 .failed_vmentry = 1, 3341 }; 3342 u32 failed_index; 3343 3344 kvm_service_local_tlb_flush_requests(vcpu); 3345 3346 evaluate_pending_interrupts = exec_controls_get(vmx) & 3347 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3348 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3349 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3350 3351 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3352 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3353 if (kvm_mpx_supported() && 3354 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 3355 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3356 3357 /* 3358 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3359 * nested early checks are disabled. In the event of a "late" VM-Fail, 3360 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3361 * software model to the pre-VMEntry host state. When EPT is disabled, 3362 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3363 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3364 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3365 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3366 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3367 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3368 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3369 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3370 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3371 * path would need to manually save/restore vmcs01.GUEST_CR3. 
3372 */ 3373 if (!enable_ept && !nested_early_check) 3374 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3375 3376 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3377 3378 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3379 3380 if (from_vmentry) { 3381 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3382 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3383 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3384 } 3385 3386 if (nested_vmx_check_vmentry_hw(vcpu)) { 3387 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3388 return NVMX_VMENTRY_VMFAIL; 3389 } 3390 3391 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3392 &entry_failure_code)) { 3393 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3394 vmcs12->exit_qualification = entry_failure_code; 3395 goto vmentry_fail_vmexit; 3396 } 3397 } 3398 3399 enter_guest_mode(vcpu); 3400 3401 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3402 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3403 vmcs12->exit_qualification = entry_failure_code; 3404 goto vmentry_fail_vmexit_guest_mode; 3405 } 3406 3407 if (from_vmentry) { 3408 failed_index = nested_vmx_load_msr(vcpu, 3409 vmcs12->vm_entry_msr_load_addr, 3410 vmcs12->vm_entry_msr_load_count); 3411 if (failed_index) { 3412 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3413 vmcs12->exit_qualification = failed_index; 3414 goto vmentry_fail_vmexit_guest_mode; 3415 } 3416 } else { 3417 /* 3418 * The MMU is not initialized to point at the right entities yet and 3419 * "get pages" would need to read data from the guest (i.e. we will 3420 * need to perform gpa to hpa translation). Request a call 3421 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3422 * have already been set at vmentry time and should not be reset. 3423 */ 3424 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3425 } 3426 3427 /* 3428 * If L1 had a pending IRQ/NMI until it executed 3429 * VMLAUNCH/VMRESUME which wasn't delivered because it was 3430 * disallowed (e.g. interrupts disabled), L0 needs to 3431 * evaluate if this pending event should cause an exit from L2 3432 * to L1 or delivered directly to L2 (e.g. In case L1 don't 3433 * intercept EXTERNAL_INTERRUPT). 3434 * 3435 * Usually this would be handled by the processor noticing an 3436 * IRQ/NMI window request, or checking RVI during evaluation of 3437 * pending virtual interrupts. However, this setting was done 3438 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 3439 * to perform pending event evaluation by requesting a KVM_REQ_EVENT. 3440 */ 3441 if (unlikely(evaluate_pending_interrupts)) 3442 kvm_make_request(KVM_REQ_EVENT, vcpu); 3443 3444 /* 3445 * Do not start the preemption timer hrtimer until after we know 3446 * we are successful, so that only nested_vmx_vmexit needs to cancel 3447 * the timer. 3448 */ 3449 vmx->nested.preemption_timer_expired = false; 3450 if (nested_cpu_has_preemption_timer(vmcs12)) { 3451 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3452 vmx_start_preemption_timer(vcpu, timer_value); 3453 } 3454 3455 /* 3456 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3457 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3458 * returned as far as L1 is concerned. It will only return (and set 3459 * the success flag) when L2 exits (see nested_vmx_vmexit()). 
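 *
 * Returning NVMX_VMENTRY_SUCCESS simply tells callers such as
 * nested_vmx_run() that the vCPU is now in VMX non-root mode; L1's
 * RFLAGS and VM-instruction error field are only updated on the
 * eventual L2->L1 VM-Exit.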
3460 */ 3461 return NVMX_VMENTRY_SUCCESS; 3462 3463 /* 3464 * A failed consistency check that leads to a VMExit during L1's 3465 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3466 * 26.7 "VM-entry failures during or after loading guest state". 3467 */ 3468 vmentry_fail_vmexit_guest_mode: 3469 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3470 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3471 leave_guest_mode(vcpu); 3472 3473 vmentry_fail_vmexit: 3474 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3475 3476 if (!from_vmentry) 3477 return NVMX_VMENTRY_VMEXIT; 3478 3479 load_vmcs12_host_state(vcpu, vmcs12); 3480 vmcs12->vm_exit_reason = exit_reason.full; 3481 if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 3482 vmx->nested.need_vmcs12_to_shadow_sync = true; 3483 return NVMX_VMENTRY_VMEXIT; 3484 } 3485 3486 /* 3487 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3488 * for running an L2 nested guest. 3489 */ 3490 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3491 { 3492 struct vmcs12 *vmcs12; 3493 enum nvmx_vmentry_status status; 3494 struct vcpu_vmx *vmx = to_vmx(vcpu); 3495 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3496 enum nested_evmptrld_status evmptrld_status; 3497 3498 if (!nested_vmx_check_permission(vcpu)) 3499 return 1; 3500 3501 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3502 if (evmptrld_status == EVMPTRLD_ERROR) { 3503 kvm_queue_exception(vcpu, UD_VECTOR); 3504 return 1; 3505 } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) { 3506 return nested_vmx_failInvalid(vcpu); 3507 } 3508 3509 if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && 3510 vmx->nested.current_vmptr == INVALID_GPA)) 3511 return nested_vmx_failInvalid(vcpu); 3512 3513 vmcs12 = get_vmcs12(vcpu); 3514 3515 /* 3516 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3517 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3518 * rather than RFLAGS.ZF, and no error number is stored to the 3519 * VM-instruction error field. 3520 */ 3521 if (CC(vmcs12->hdr.shadow_vmcs)) 3522 return nested_vmx_failInvalid(vcpu); 3523 3524 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 3525 copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); 3526 /* Enlightened VMCS doesn't have launch state */ 3527 vmcs12->launch_state = !launch; 3528 } else if (enable_shadow_vmcs) { 3529 copy_shadow_to_vmcs12(vmx); 3530 } 3531 3532 /* 3533 * The nested entry process starts with enforcing various prerequisites 3534 * on vmcs12 as required by the Intel SDM, and act appropriately when 3535 * they fail: As the SDM explains, some conditions should cause the 3536 * instruction to fail, while others will cause the instruction to seem 3537 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3538 * To speed up the normal (success) code path, we should avoid checking 3539 * for misconfigurations which will anyway be caught by the processor 3540 * when using the merged vmcs02. 3541 */ 3542 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3543 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3544 3545 if (CC(vmcs12->launch_state == launch)) 3546 return nested_vmx_fail(vcpu, 3547 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3548 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3549 3550 if (nested_vmx_check_controls(vcpu, vmcs12)) 3551 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3552 3553 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3554 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3555 3556 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3557 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3558 3559 /* 3560 * We're finally done with prerequisite checking, and can start with 3561 * the nested entry. 3562 */ 3563 vmx->nested.nested_run_pending = 1; 3564 vmx->nested.has_preemption_timer_deadline = false; 3565 status = nested_vmx_enter_non_root_mode(vcpu, true); 3566 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3567 goto vmentry_failed; 3568 3569 /* Emulate processing of posted interrupts on VM-Enter. */ 3570 if (nested_cpu_has_posted_intr(vmcs12) && 3571 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3572 vmx->nested.pi_pending = true; 3573 kvm_make_request(KVM_REQ_EVENT, vcpu); 3574 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3575 } 3576 3577 /* Hide L1D cache contents from the nested guest. */ 3578 vmx->vcpu.arch.l1tf_flush_l1d = true; 3579 3580 /* 3581 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3582 * also be used as part of restoring nVMX state for 3583 * snapshot restore (migration). 3584 * 3585 * In this flow, it is assumed that vmcs12 cache was 3586 * transferred as part of captured nVMX state and should 3587 * therefore not be read from guest memory (which may not 3588 * exist on destination host yet). 3589 */ 3590 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3591 3592 switch (vmcs12->guest_activity_state) { 3593 case GUEST_ACTIVITY_HLT: 3594 /* 3595 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3596 * awakened by event injection or by an NMI-window VM-exit or 3597 * by an interrupt-window VM-exit, halt the vcpu. 3598 */ 3599 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3600 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3601 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3602 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3603 vmx->nested.nested_run_pending = 0; 3604 return kvm_vcpu_halt(vcpu); 3605 } 3606 break; 3607 case GUEST_ACTIVITY_WAIT_SIPI: 3608 vmx->nested.nested_run_pending = 0; 3609 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3610 break; 3611 default: 3612 break; 3613 } 3614 3615 return 1; 3616 3617 vmentry_failed: 3618 vmx->nested.nested_run_pending = 0; 3619 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3620 return 0; 3621 if (status == NVMX_VMENTRY_VMEXIT) 3622 return 1; 3623 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3624 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3625 } 3626 3627 /* 3628 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3629 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3630 * This function returns the new value we should put in vmcs12.guest_cr0. 3631 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3632 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3633 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3634 * didn't trap the bit, because if L1 did, so would L0). 3635 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3636 * been modified by L2, and L1 knows it. 
So just leave the old value of 3637 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3638 * isn't relevant, because if L0 traps this bit it can set it to anything. 3639 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3640 * changed these bits, and therefore they need to be updated, but L0 3641 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3642 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3643 */ 3644 static inline unsigned long 3645 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3646 { 3647 return 3648 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3649 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3650 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3651 vcpu->arch.cr0_guest_owned_bits)); 3652 } 3653 3654 static inline unsigned long 3655 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3656 { 3657 return 3658 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3659 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3660 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3661 vcpu->arch.cr4_guest_owned_bits)); 3662 } 3663 3664 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3665 struct vmcs12 *vmcs12) 3666 { 3667 u32 idt_vectoring; 3668 unsigned int nr; 3669 3670 if (vcpu->arch.exception.injected) { 3671 nr = vcpu->arch.exception.nr; 3672 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3673 3674 if (kvm_exception_is_soft(nr)) { 3675 vmcs12->vm_exit_instruction_len = 3676 vcpu->arch.event_exit_inst_len; 3677 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3678 } else 3679 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3680 3681 if (vcpu->arch.exception.has_error_code) { 3682 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3683 vmcs12->idt_vectoring_error_code = 3684 vcpu->arch.exception.error_code; 3685 } 3686 3687 vmcs12->idt_vectoring_info_field = idt_vectoring; 3688 } else if (vcpu->arch.nmi_injected) { 3689 vmcs12->idt_vectoring_info_field = 3690 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3691 } else if (vcpu->arch.interrupt.injected) { 3692 nr = vcpu->arch.interrupt.nr; 3693 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3694 3695 if (vcpu->arch.interrupt.soft) { 3696 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3697 vmcs12->vm_entry_instruction_len = 3698 vcpu->arch.event_exit_inst_len; 3699 } else 3700 idt_vectoring |= INTR_TYPE_EXT_INTR; 3701 3702 vmcs12->idt_vectoring_info_field = idt_vectoring; 3703 } 3704 } 3705 3706 3707 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3708 { 3709 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3710 gfn_t gfn; 3711 3712 /* 3713 * Don't need to mark the APIC access page dirty; it is never 3714 * written to by the CPU during APIC virtualization. 
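 *
 * The virtual-APIC page and the posted-interrupt descriptor, by
 * contrast, are written while L2 runs (by the CPU, or by KVM when
 * emulating posted interrupts), so their gfns are marked dirty below.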
3715 */ 3716 3717 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3718 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3719 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3720 } 3721 3722 if (nested_cpu_has_posted_intr(vmcs12)) { 3723 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3724 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3725 } 3726 } 3727 3728 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3729 { 3730 struct vcpu_vmx *vmx = to_vmx(vcpu); 3731 int max_irr; 3732 void *vapic_page; 3733 u16 status; 3734 3735 if (!vmx->nested.pi_pending) 3736 return 0; 3737 3738 if (!vmx->nested.pi_desc) 3739 goto mmio_needed; 3740 3741 vmx->nested.pi_pending = false; 3742 3743 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3744 return 0; 3745 3746 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3747 if (max_irr != 256) { 3748 vapic_page = vmx->nested.virtual_apic_map.hva; 3749 if (!vapic_page) 3750 goto mmio_needed; 3751 3752 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3753 vapic_page, &max_irr); 3754 status = vmcs_read16(GUEST_INTR_STATUS); 3755 if ((u8)max_irr > ((u8)status & 0xff)) { 3756 status &= ~0xff; 3757 status |= (u8)max_irr; 3758 vmcs_write16(GUEST_INTR_STATUS, status); 3759 } 3760 } 3761 3762 nested_mark_vmcs12_pages_dirty(vcpu); 3763 return 0; 3764 3765 mmio_needed: 3766 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3767 return -ENXIO; 3768 } 3769 3770 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3771 unsigned long exit_qual) 3772 { 3773 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3774 unsigned int nr = vcpu->arch.exception.nr; 3775 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3776 3777 if (vcpu->arch.exception.has_error_code) { 3778 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3779 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3780 } 3781 3782 if (kvm_exception_is_soft(nr)) 3783 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3784 else 3785 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3786 3787 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3788 vmx_get_nmi_mask(vcpu)) 3789 intr_info |= INTR_INFO_UNBLOCK_NMI; 3790 3791 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3792 } 3793 3794 /* 3795 * Returns true if a debug trap is pending delivery. 3796 * 3797 * In KVM, debug traps bear an exception payload. As such, the class of a #DB 3798 * exception may be inferred from the presence of an exception payload. 3799 */ 3800 static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) 3801 { 3802 return vcpu->arch.exception.pending && 3803 vcpu->arch.exception.nr == DB_VECTOR && 3804 vcpu->arch.exception.payload; 3805 } 3806 3807 /* 3808 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 3809 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 3810 * represents these debug traps with a payload that is said to be compatible 3811 * with the 'pending debug exceptions' field, write the payload to the VMCS 3812 * field if a VM-exit is delivered before the debug trap. 
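 *
 * For example, a single-step #DB recorded by KVM carries a payload with
 * the BS bit set, and that payload can be written as-is to
 * GUEST_PENDING_DBG_EXCEPTIONS, which is all the helper below does.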
3813 */ 3814 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 3815 { 3816 if (vmx_pending_dbg_trap(vcpu)) 3817 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 3818 vcpu->arch.exception.payload); 3819 } 3820 3821 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 3822 { 3823 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3824 to_vmx(vcpu)->nested.preemption_timer_expired; 3825 } 3826 3827 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 3828 { 3829 struct vcpu_vmx *vmx = to_vmx(vcpu); 3830 unsigned long exit_qual; 3831 bool block_nested_events = 3832 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3833 bool mtf_pending = vmx->nested.mtf_pending; 3834 struct kvm_lapic *apic = vcpu->arch.apic; 3835 3836 /* 3837 * Clear the MTF state. If a higher priority VM-exit is delivered first, 3838 * this state is discarded. 3839 */ 3840 if (!block_nested_events) 3841 vmx->nested.mtf_pending = false; 3842 3843 if (lapic_in_kernel(vcpu) && 3844 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3845 if (block_nested_events) 3846 return -EBUSY; 3847 nested_vmx_update_pending_dbg(vcpu); 3848 clear_bit(KVM_APIC_INIT, &apic->pending_events); 3849 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 3850 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3851 return 0; 3852 } 3853 3854 if (lapic_in_kernel(vcpu) && 3855 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 3856 if (block_nested_events) 3857 return -EBUSY; 3858 3859 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 3860 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 3861 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 3862 apic->sipi_vector & 0xFFUL); 3863 return 0; 3864 } 3865 3866 /* 3867 * Process any exceptions that are not debug traps before MTF. 3868 * 3869 * Note that only a pending nested run can block a pending exception. 3870 * Otherwise an injected NMI/interrupt should either be 3871 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO, 3872 * while delivering the pending exception. 
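 *
 * The checks below therefore run in priority order: exceptions that are
 * not #DB traps, MTF, #DB traps, the VMX-preemption timer, SMI, NMI and
 * finally external interrupts.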
3873 */ 3874 3875 if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { 3876 if (vmx->nested.nested_run_pending) 3877 return -EBUSY; 3878 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3879 goto no_vmexit; 3880 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3881 return 0; 3882 } 3883 3884 if (mtf_pending) { 3885 if (block_nested_events) 3886 return -EBUSY; 3887 nested_vmx_update_pending_dbg(vcpu); 3888 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 3889 return 0; 3890 } 3891 3892 if (vcpu->arch.exception.pending) { 3893 if (vmx->nested.nested_run_pending) 3894 return -EBUSY; 3895 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3896 goto no_vmexit; 3897 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3898 return 0; 3899 } 3900 3901 if (nested_vmx_preemption_timer_pending(vcpu)) { 3902 if (block_nested_events) 3903 return -EBUSY; 3904 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3905 return 0; 3906 } 3907 3908 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 3909 if (block_nested_events) 3910 return -EBUSY; 3911 goto no_vmexit; 3912 } 3913 3914 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 3915 if (block_nested_events) 3916 return -EBUSY; 3917 if (!nested_exit_on_nmi(vcpu)) 3918 goto no_vmexit; 3919 3920 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3921 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3922 INTR_INFO_VALID_MASK, 0); 3923 /* 3924 * The NMI-triggered VM exit counts as injection: 3925 * clear this one and block further NMIs. 3926 */ 3927 vcpu->arch.nmi_pending = 0; 3928 vmx_set_nmi_mask(vcpu, true); 3929 return 0; 3930 } 3931 3932 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 3933 if (block_nested_events) 3934 return -EBUSY; 3935 if (!nested_exit_on_intr(vcpu)) 3936 goto no_vmexit; 3937 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3938 return 0; 3939 } 3940 3941 no_vmexit: 3942 return vmx_complete_nested_posted_interrupt(vcpu); 3943 } 3944 3945 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3946 { 3947 ktime_t remaining = 3948 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 3949 u64 value; 3950 3951 if (ktime_to_ns(remaining) <= 0) 3952 return 0; 3953 3954 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 3955 do_div(value, 1000000); 3956 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 3957 } 3958 3959 static bool is_vmcs12_ext_field(unsigned long field) 3960 { 3961 switch (field) { 3962 case GUEST_ES_SELECTOR: 3963 case GUEST_CS_SELECTOR: 3964 case GUEST_SS_SELECTOR: 3965 case GUEST_DS_SELECTOR: 3966 case GUEST_FS_SELECTOR: 3967 case GUEST_GS_SELECTOR: 3968 case GUEST_LDTR_SELECTOR: 3969 case GUEST_TR_SELECTOR: 3970 case GUEST_ES_LIMIT: 3971 case GUEST_CS_LIMIT: 3972 case GUEST_SS_LIMIT: 3973 case GUEST_DS_LIMIT: 3974 case GUEST_FS_LIMIT: 3975 case GUEST_GS_LIMIT: 3976 case GUEST_LDTR_LIMIT: 3977 case GUEST_TR_LIMIT: 3978 case GUEST_GDTR_LIMIT: 3979 case GUEST_IDTR_LIMIT: 3980 case GUEST_ES_AR_BYTES: 3981 case GUEST_DS_AR_BYTES: 3982 case GUEST_FS_AR_BYTES: 3983 case GUEST_GS_AR_BYTES: 3984 case GUEST_LDTR_AR_BYTES: 3985 case GUEST_TR_AR_BYTES: 3986 case GUEST_ES_BASE: 3987 case GUEST_CS_BASE: 3988 case GUEST_SS_BASE: 3989 case GUEST_DS_BASE: 3990 case GUEST_FS_BASE: 3991 case GUEST_GS_BASE: 3992 case GUEST_LDTR_BASE: 3993 case GUEST_TR_BASE: 3994 case GUEST_GDTR_BASE: 3995 case GUEST_IDTR_BASE: 3996 case GUEST_PENDING_DBG_EXCEPTIONS: 3997 case GUEST_BNDCFGS: 3998 return true; 3999 default: 4000 break; 4001 } 4002 4003 return 
false; 4004 } 4005 4006 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4007 struct vmcs12 *vmcs12) 4008 { 4009 struct vcpu_vmx *vmx = to_vmx(vcpu); 4010 4011 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4012 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4013 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4014 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4015 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4016 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4017 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4018 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4019 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4020 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4021 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4022 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4023 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4024 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4025 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4026 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4027 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4028 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4029 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4030 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4031 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4032 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4033 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4034 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4035 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4036 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4037 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4038 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4039 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4040 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4041 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4042 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4043 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4044 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4045 vmcs12->guest_pending_dbg_exceptions = 4046 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4047 if (kvm_mpx_supported()) 4048 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 4049 4050 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4051 } 4052 4053 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4054 struct vmcs12 *vmcs12) 4055 { 4056 struct vcpu_vmx *vmx = to_vmx(vcpu); 4057 int cpu; 4058 4059 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4060 return; 4061 4062 4063 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4064 4065 cpu = get_cpu(); 4066 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4067 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4068 4069 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4070 4071 vmx->loaded_vmcs = &vmx->vmcs01; 4072 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4073 put_cpu(); 4074 } 4075 4076 /* 4077 * Update the guest state fields of vmcs12 to reflect changes that 4078 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4079 * VM-entry controls is also updated, since this is really a guest 4080 * state bit.) 
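 *
 * The rarely-read fields (segment selectors/limits/bases/AR bytes,
 * descriptor-table limits, pending debug exceptions, BNDCFGS) are
 * handled by sync_vmcs02_to_vmcs12_rare(); for a non-enlightened VMCS
 * their sync is deferred, via need_sync_vmcs02_to_vmcs12_rare, until L1
 * actually reads or writes them or the vmcs12 is released.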
4081 */ 4082 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4083 { 4084 struct vcpu_vmx *vmx = to_vmx(vcpu); 4085 4086 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 4087 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4088 4089 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4090 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); 4091 4092 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4093 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4094 4095 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4096 vmcs12->guest_rip = kvm_rip_read(vcpu); 4097 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4098 4099 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4100 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4101 4102 vmcs12->guest_interruptibility_info = 4103 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4104 4105 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4106 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4107 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4108 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4109 else 4110 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4111 4112 if (nested_cpu_has_preemption_timer(vmcs12) && 4113 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4114 !vmx->nested.nested_run_pending) 4115 vmcs12->vmx_preemption_timer_value = 4116 vmx_get_preemption_timer_value(vcpu); 4117 4118 /* 4119 * In some cases (usually, nested EPT), L2 is allowed to change its 4120 * own CR3 without exiting. If it has changed it, we must keep it. 4121 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4122 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4123 * 4124 * Additionally, restore L2's PDPTR to vmcs12. 4125 */ 4126 if (enable_ept) { 4127 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4128 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4129 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4130 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4131 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4132 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4133 } 4134 } 4135 4136 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4137 4138 if (nested_cpu_has_vid(vmcs12)) 4139 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4140 4141 vmcs12->vm_entry_controls = 4142 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4143 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4144 4145 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4146 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 4147 4148 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4149 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4150 } 4151 4152 /* 4153 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4154 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4155 * and this function updates it to reflect the changes to the guest state while 4156 * L2 was running (and perhaps made some exits which were handled directly by L0 4157 * without going back to L1), and to reflect the exit reason. 4158 * Note that we do not have to copy here all VMCS fields, just those that 4159 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4160 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4161 * which already writes to vmcs12 directly. 
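 *
 * Note the split: the guest-state fields are copied by
 * sync_vmcs02_to_vmcs12() before this function runs (see
 * nested_vmx_vmexit()); prepare_vmcs12() itself mainly fills in the
 * exit-information fields, the IDT-vectoring info and launch_state, and
 * stores the guest MSRs on a normal exit.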
4162 */ 4163 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4164 u32 vm_exit_reason, u32 exit_intr_info, 4165 unsigned long exit_qualification) 4166 { 4167 /* update exit information fields: */ 4168 vmcs12->vm_exit_reason = vm_exit_reason; 4169 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4170 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4171 vmcs12->exit_qualification = exit_qualification; 4172 vmcs12->vm_exit_intr_info = exit_intr_info; 4173 4174 vmcs12->idt_vectoring_info_field = 0; 4175 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4176 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4177 4178 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4179 vmcs12->launch_state = 1; 4180 4181 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4182 * instead of reading the real value. */ 4183 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4184 4185 /* 4186 * Transfer the event that L0 or L1 may wanted to inject into 4187 * L2 to IDT_VECTORING_INFO_FIELD. 4188 */ 4189 vmcs12_save_pending_event(vcpu, vmcs12); 4190 4191 /* 4192 * According to spec, there's no need to store the guest's 4193 * MSRs if the exit is due to a VM-entry failure that occurs 4194 * during or after loading the guest state. Since this exit 4195 * does not fall in that category, we need to save the MSRs. 4196 */ 4197 if (nested_vmx_store_msr(vcpu, 4198 vmcs12->vm_exit_msr_store_addr, 4199 vmcs12->vm_exit_msr_store_count)) 4200 nested_vmx_abort(vcpu, 4201 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4202 } 4203 4204 /* 4205 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 4206 * preserved above and would only end up incorrectly in L1. 4207 */ 4208 vcpu->arch.nmi_injected = false; 4209 kvm_clear_exception_queue(vcpu); 4210 kvm_clear_interrupt_queue(vcpu); 4211 } 4212 4213 /* 4214 * A part of what we need to when the nested L2 guest exits and we want to 4215 * run its L1 parent, is to reset L1's guest state to the host state specified 4216 * in vmcs12. 4217 * This function is to be called not only on normal nested exit, but also on 4218 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4219 * Failures During or After Loading Guest State"). 4220 * This function should be called when the active VMCS is L1's (vmcs01). 4221 */ 4222 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4223 struct vmcs12 *vmcs12) 4224 { 4225 enum vm_entry_failure_code ignored; 4226 struct kvm_segment seg; 4227 4228 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4229 vcpu->arch.efer = vmcs12->host_ia32_efer; 4230 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4231 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4232 else 4233 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4234 vmx_set_efer(vcpu, vcpu->arch.efer); 4235 4236 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4237 kvm_rip_write(vcpu, vmcs12->host_rip); 4238 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4239 vmx_set_interrupt_shadow(vcpu, 0); 4240 4241 /* 4242 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4243 * actually changed, because vmx_set_cr0 refers to efer set above. 4244 * 4245 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4246 * (KVM doesn't change it); 4247 */ 4248 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4249 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4250 4251 /* Same as above - no reason to call set_cr4_guest_host_mask(). 
*/ 4252 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4253 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4254 4255 nested_ept_uninit_mmu_context(vcpu); 4256 4257 /* 4258 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4259 * couldn't have changed. 4260 */ 4261 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4262 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4263 4264 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4265 4266 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4267 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4268 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4269 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4270 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4271 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4272 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4273 4274 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4275 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4276 vmcs_write64(GUEST_BNDCFGS, 0); 4277 4278 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4279 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4280 vcpu->arch.pat = vmcs12->host_ia32_pat; 4281 } 4282 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 4283 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4284 vmcs12->host_ia32_perf_global_ctrl)); 4285 4286 /* Set L1 segment info according to Intel SDM 4287 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4288 seg = (struct kvm_segment) { 4289 .base = 0, 4290 .limit = 0xFFFFFFFF, 4291 .selector = vmcs12->host_cs_selector, 4292 .type = 11, 4293 .present = 1, 4294 .s = 1, 4295 .g = 1 4296 }; 4297 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4298 seg.l = 1; 4299 else 4300 seg.db = 1; 4301 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4302 seg = (struct kvm_segment) { 4303 .base = 0, 4304 .limit = 0xFFFFFFFF, 4305 .type = 3, 4306 .present = 1, 4307 .s = 1, 4308 .db = 1, 4309 .g = 1 4310 }; 4311 seg.selector = vmcs12->host_ds_selector; 4312 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4313 seg.selector = vmcs12->host_es_selector; 4314 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4315 seg.selector = vmcs12->host_ss_selector; 4316 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4317 seg.selector = vmcs12->host_fs_selector; 4318 seg.base = vmcs12->host_fs_base; 4319 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4320 seg.selector = vmcs12->host_gs_selector; 4321 seg.base = vmcs12->host_gs_base; 4322 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4323 seg = (struct kvm_segment) { 4324 .base = vmcs12->host_tr_base, 4325 .limit = 0x67, 4326 .selector = vmcs12->host_tr_selector, 4327 .type = 11, 4328 .present = 1 4329 }; 4330 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4331 4332 memset(&seg, 0, sizeof(seg)); 4333 seg.unusable = 1; 4334 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4335 4336 kvm_set_dr(vcpu, 7, 0x400); 4337 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4338 4339 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4340 vmcs12->vm_exit_msr_load_count)) 4341 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4342 4343 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4344 } 4345 4346 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4347 { 4348 struct vmx_uret_msr *efer_msr; 4349 unsigned int i; 4350 4351 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4352 return 
vmcs_read64(GUEST_IA32_EFER); 4353 4354 if (cpu_has_load_ia32_efer()) 4355 return host_efer; 4356 4357 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4358 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4359 return vmx->msr_autoload.guest.val[i].value; 4360 } 4361 4362 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4363 if (efer_msr) 4364 return efer_msr->data; 4365 4366 return host_efer; 4367 } 4368 4369 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4370 { 4371 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4372 struct vcpu_vmx *vmx = to_vmx(vcpu); 4373 struct vmx_msr_entry g, h; 4374 gpa_t gpa; 4375 u32 i, j; 4376 4377 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4378 4379 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4380 /* 4381 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4382 * as vmcs01.GUEST_DR7 contains a userspace defined value 4383 * and vcpu->arch.dr7 is not squirreled away before the 4384 * nested VMENTER (not worth adding a variable in nested_vmx). 4385 */ 4386 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4387 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4388 else 4389 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4390 } 4391 4392 /* 4393 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4394 * handle a variety of side effects to KVM's software model. 4395 */ 4396 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4397 4398 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4399 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4400 4401 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4402 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4403 4404 nested_ept_uninit_mmu_context(vcpu); 4405 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4406 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4407 4408 /* 4409 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4410 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4411 * VMFail, like everything else we just need to ensure our 4412 * software model is up-to-date. 4413 */ 4414 if (enable_ept && is_pae_paging(vcpu)) 4415 ept_save_pdptrs(vcpu); 4416 4417 kvm_mmu_reset_context(vcpu); 4418 4419 /* 4420 * This nasty bit of open coding is a compromise between blindly 4421 * loading L1's MSRs using the exit load lists (incorrect emulation 4422 * of VMFail), leaving the nested VM's MSRs in the software model 4423 * (incorrect behavior) and snapshotting the modified MSRs (too 4424 * expensive since the lists are unbound by hardware). For each 4425 * MSR that was (prematurely) loaded from the nested VMEntry load 4426 * list, reload it from the exit load list if it exists and differs 4427 * from the guest value. The intent is to stuff host state as 4428 * silently as possible, not to fully process the exit load list. 
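 *
 * Illustrative sketch of the loops below:
 *
 *	for each entry g in the VM-entry MSR-load list:
 *		find h in the VM-exit MSR-load list with h.index == g.index
 *		if found and h.value != g.value:
 *			WRMSR(h.index, h.value)
 *
 * with any failed guest read, sanity check or WRMSR escalating to a
 * VMX abort (VMX_ABORT_LOAD_HOST_MSR_FAIL).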
4429 */ 4430 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4431 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4432 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4433 pr_debug_ratelimited( 4434 "%s read MSR index failed (%u, 0x%08llx)\n", 4435 __func__, i, gpa); 4436 goto vmabort; 4437 } 4438 4439 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4440 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4441 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4442 pr_debug_ratelimited( 4443 "%s read MSR failed (%u, 0x%08llx)\n", 4444 __func__, j, gpa); 4445 goto vmabort; 4446 } 4447 if (h.index != g.index) 4448 continue; 4449 if (h.value == g.value) 4450 break; 4451 4452 if (nested_vmx_load_msr_check(vcpu, &h)) { 4453 pr_debug_ratelimited( 4454 "%s check failed (%u, 0x%x, 0x%x)\n", 4455 __func__, j, h.index, h.reserved); 4456 goto vmabort; 4457 } 4458 4459 if (kvm_set_msr(vcpu, h.index, h.value)) { 4460 pr_debug_ratelimited( 4461 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4462 __func__, j, h.index, h.value); 4463 goto vmabort; 4464 } 4465 } 4466 } 4467 4468 return; 4469 4470 vmabort: 4471 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4472 } 4473 4474 /* 4475 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4476 * and modify vmcs12 to make it see what it would expect to see there if 4477 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4478 */ 4479 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4480 u32 exit_intr_info, unsigned long exit_qualification) 4481 { 4482 struct vcpu_vmx *vmx = to_vmx(vcpu); 4483 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4484 4485 /* trying to cancel vmlaunch/vmresume is a bug */ 4486 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4487 4488 /* Similarly, triple faults in L2 should never escape. */ 4489 WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); 4490 4491 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4492 /* 4493 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4494 * Enlightened VMCS after migration and we still need to 4495 * do that when something is forcing L2->L1 exit prior to 4496 * the first L2 run. 4497 */ 4498 (void)nested_get_evmcs_page(vcpu); 4499 } 4500 4501 /* Service pending TLB flush requests for L2 before switching to L1. */ 4502 kvm_service_local_tlb_flush_requests(vcpu); 4503 4504 /* 4505 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4506 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4507 * up-to-date before switching to L1. 4508 */ 4509 if (enable_ept && is_pae_paging(vcpu)) 4510 vmx_ept_load_pdptrs(vcpu); 4511 4512 leave_guest_mode(vcpu); 4513 4514 if (nested_cpu_has_preemption_timer(vmcs12)) 4515 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4516 4517 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4518 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4519 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4520 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4521 } 4522 4523 if (likely(!vmx->fail)) { 4524 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4525 4526 if (vm_exit_reason != -1) 4527 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4528 exit_intr_info, exit_qualification); 4529 4530 /* 4531 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4532 * also be used to capture vmcs12 cache as part of 4533 * capturing nVMX state for snapshot (migration). 
4534 * 4535 * Otherwise, this flush will dirty guest memory at a 4536 * point it is already assumed by user-space to be 4537 * immutable. 4538 */ 4539 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4540 } else { 4541 /* 4542 * The only expected VM-instruction error is "VM entry with 4543 * invalid control field(s)." Anything else indicates a 4544 * problem with L0. And we should never get here with a 4545 * VMFail of any type if early consistency checks are enabled. 4546 */ 4547 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4548 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4549 WARN_ON_ONCE(nested_early_check); 4550 } 4551 4552 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4553 4554 /* Update any VMCS fields that might have changed while L2 ran */ 4555 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4556 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4557 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4558 if (kvm_has_tsc_control) 4559 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 4560 4561 if (vmx->nested.l1_tpr_threshold != -1) 4562 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4563 4564 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4565 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4566 vmx_set_virtual_apic_mode(vcpu); 4567 } 4568 4569 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4570 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4571 vmx_update_cpu_dirty_logging(vcpu); 4572 } 4573 4574 /* Unpin physical memory we referred to in vmcs02 */ 4575 if (vmx->nested.apic_access_page) { 4576 kvm_release_page_clean(vmx->nested.apic_access_page); 4577 vmx->nested.apic_access_page = NULL; 4578 } 4579 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4580 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4581 vmx->nested.pi_desc = NULL; 4582 4583 if (vmx->nested.reload_vmcs01_apic_access_page) { 4584 vmx->nested.reload_vmcs01_apic_access_page = false; 4585 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4586 } 4587 4588 if ((vm_exit_reason != -1) && 4589 (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) 4590 vmx->nested.need_vmcs12_to_shadow_sync = true; 4591 4592 /* in case we halted in L2 */ 4593 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4594 4595 if (likely(!vmx->fail)) { 4596 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4597 nested_exit_intr_ack_set(vcpu)) { 4598 int irq = kvm_cpu_get_interrupt(vcpu); 4599 WARN_ON(irq < 0); 4600 vmcs12->vm_exit_intr_info = irq | 4601 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4602 } 4603 4604 if (vm_exit_reason != -1) 4605 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4606 vmcs12->exit_qualification, 4607 vmcs12->idt_vectoring_info_field, 4608 vmcs12->vm_exit_intr_info, 4609 vmcs12->vm_exit_intr_error_code, 4610 KVM_ISA_VMX); 4611 4612 load_vmcs12_host_state(vcpu, vmcs12); 4613 4614 return; 4615 } 4616 4617 /* 4618 * After an early L2 VM-entry failure, we're now back 4619 * in L1 which thinks it just finished a VMLAUNCH or 4620 * VMRESUME instruction, so we need to set the failure 4621 * flag and the VM-instruction error field of the VMCS 4622 * accordingly, and skip the emulated instruction. 4623 */ 4624 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4625 4626 /* 4627 * Restore L1's host state to KVM's software model. 
We're here 4628 * because a consistency check was caught by hardware, which 4629 * means some amount of guest state has been propagated to KVM's 4630 * model and needs to be unwound to the host's state. 4631 */ 4632 nested_vmx_restore_host_state(vcpu); 4633 4634 vmx->fail = 0; 4635 } 4636 4637 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4638 { 4639 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4640 } 4641 4642 /* 4643 * Decode the memory-address operand of a vmx instruction, as recorded on an 4644 * exit caused by such an instruction (run by a guest hypervisor). 4645 * On success, returns 0. When the operand is invalid, returns 1 and throws 4646 * #UD, #GP, or #SS. 4647 */ 4648 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4649 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4650 { 4651 gva_t off; 4652 bool exn; 4653 struct kvm_segment s; 4654 4655 /* 4656 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4657 * Execution", on an exit, vmx_instruction_info holds most of the 4658 * addressing components of the operand. Only the displacement part 4659 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4660 * For how an actual address is calculated from all these components, 4661 * refer to Vol. 1, "Operand Addressing". 4662 */ 4663 int scaling = vmx_instruction_info & 3; 4664 int addr_size = (vmx_instruction_info >> 7) & 7; 4665 bool is_reg = vmx_instruction_info & (1u << 10); 4666 int seg_reg = (vmx_instruction_info >> 15) & 7; 4667 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4668 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4669 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4670 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4671 4672 if (is_reg) { 4673 kvm_queue_exception(vcpu, UD_VECTOR); 4674 return 1; 4675 } 4676 4677 /* Addr = segment_base + offset */ 4678 /* offset = base + [index * scale] + displacement */ 4679 off = exit_qualification; /* holds the displacement */ 4680 if (addr_size == 1) 4681 off = (gva_t)sign_extend64(off, 31); 4682 else if (addr_size == 0) 4683 off = (gva_t)sign_extend64(off, 15); 4684 if (base_is_valid) 4685 off += kvm_register_read(vcpu, base_reg); 4686 if (index_is_valid) 4687 off += kvm_register_read(vcpu, index_reg) << scaling; 4688 vmx_get_segment(vcpu, &s, seg_reg); 4689 4690 /* 4691 * The effective address, i.e. @off, of a memory operand is truncated 4692 * based on the address size of the instruction. Note that this is 4693 * the *effective address*, i.e. the address prior to accounting for 4694 * the segment's base. 4695 */ 4696 if (addr_size == 1) /* 32 bit */ 4697 off &= 0xffffffff; 4698 else if (addr_size == 0) /* 16 bit */ 4699 off &= 0xffff; 4700 4701 /* Checks for #GP/#SS exceptions. */ 4702 exn = false; 4703 if (is_long_mode(vcpu)) { 4704 /* 4705 * The virtual/linear address is never truncated in 64-bit 4706 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4707 * address when using FS/GS with a non-zero base. 4708 */ 4709 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4710 *ret = s.base + off; 4711 else 4712 *ret = off; 4713 4714 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4715 * non-canonical form. This is the only check on the memory 4716 * destination for long mode! 
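 *
 * Note that no segment-limit check is applied here; e.g. (purely
 * illustrative) a VMREAD operand addressed through FS with a 32-bit
 * address size and a non-zero FS.base only has to yield a canonical
 * linear address.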
4717 */ 4718 exn = is_noncanonical_address(*ret, vcpu); 4719 } else { 4720 /* 4721 * When not in long mode, the virtual/linear address is 4722 * unconditionally truncated to 32 bits regardless of the 4723 * address size. 4724 */ 4725 *ret = (s.base + off) & 0xffffffff; 4726 4727 /* Protected mode: apply checks for segment validity in the 4728 * following order: 4729 * - segment type check (#GP(0) may be thrown) 4730 * - usability check (#GP(0)/#SS(0)) 4731 * - limit check (#GP(0)/#SS(0)) 4732 */ 4733 if (wr) 4734 /* #GP(0) if the destination operand is located in a 4735 * read-only data segment or any code segment. 4736 */ 4737 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4738 else 4739 /* #GP(0) if the source operand is located in an 4740 * execute-only code segment 4741 */ 4742 exn = ((s.type & 0xa) == 8); 4743 if (exn) { 4744 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4745 return 1; 4746 } 4747 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4748 */ 4749 exn = (s.unusable != 0); 4750 4751 /* 4752 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4753 * outside the segment limit. All CPUs that support VMX ignore 4754 * limit checks for flat segments, i.e. segments with base==0, 4755 * limit==0xffffffff and of type expand-up data or code. 4756 */ 4757 if (!(s.base == 0 && s.limit == 0xffffffff && 4758 ((s.type & 8) || !(s.type & 4)))) 4759 exn = exn || ((u64)off + len - 1 > s.limit); 4760 } 4761 if (exn) { 4762 kvm_queue_exception_e(vcpu, 4763 seg_reg == VCPU_SREG_SS ? 4764 SS_VECTOR : GP_VECTOR, 4765 0); 4766 return 1; 4767 } 4768 4769 return 0; 4770 } 4771 4772 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) 4773 { 4774 struct vcpu_vmx *vmx; 4775 4776 if (!nested_vmx_allowed(vcpu)) 4777 return; 4778 4779 vmx = to_vmx(vcpu); 4780 if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { 4781 vmx->nested.msrs.entry_ctls_high |= 4782 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4783 vmx->nested.msrs.exit_ctls_high |= 4784 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4785 } else { 4786 vmx->nested.msrs.entry_ctls_high &= 4787 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4788 vmx->nested.msrs.exit_ctls_high &= 4789 ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4790 } 4791 } 4792 4793 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 4794 int *ret) 4795 { 4796 gva_t gva; 4797 struct x86_exception e; 4798 int r; 4799 4800 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 4801 vmcs_read32(VMX_INSTRUCTION_INFO), false, 4802 sizeof(*vmpointer), &gva)) { 4803 *ret = 1; 4804 return -EINVAL; 4805 } 4806 4807 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 4808 if (r != X86EMUL_CONTINUE) { 4809 *ret = kvm_handle_memory_failure(vcpu, r, &e); 4810 return -EINVAL; 4811 } 4812 4813 return 0; 4814 } 4815 4816 /* 4817 * Allocate a shadow VMCS and associate it with the currently loaded 4818 * VMCS, unless such a shadow VMCS already exists. The newly allocated 4819 * VMCS is also VMCLEARed, so that it is ready for use. 4820 */ 4821 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 4822 { 4823 struct vcpu_vmx *vmx = to_vmx(vcpu); 4824 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 4825 4826 /* 4827 * We should allocate a shadow vmcs for vmcs01 only when L1 4828 * executes VMXON and free it when L1 executes VMXOFF. 4829 * As it is invalid to execute VMXON twice, we shouldn't reach 4830 * here when vmcs01 already have an allocated shadow vmcs. 
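 *
 * The WARN_ON below guards exactly that invariant; for vmcs02 an
 * already-allocated shadow VMCS is simply reused.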
4831 */ 4832 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); 4833 4834 if (!loaded_vmcs->shadow_vmcs) { 4835 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 4836 if (loaded_vmcs->shadow_vmcs) 4837 vmcs_clear(loaded_vmcs->shadow_vmcs); 4838 } 4839 return loaded_vmcs->shadow_vmcs; 4840 } 4841 4842 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 4843 { 4844 struct vcpu_vmx *vmx = to_vmx(vcpu); 4845 int r; 4846 4847 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 4848 if (r < 0) 4849 goto out_vmcs02; 4850 4851 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4852 if (!vmx->nested.cached_vmcs12) 4853 goto out_cached_vmcs12; 4854 4855 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 4856 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4857 if (!vmx->nested.cached_shadow_vmcs12) 4858 goto out_cached_shadow_vmcs12; 4859 4860 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 4861 goto out_shadow_vmcs; 4862 4863 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 4864 HRTIMER_MODE_ABS_PINNED); 4865 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 4866 4867 vmx->nested.vpid02 = allocate_vpid(); 4868 4869 vmx->nested.vmcs02_initialized = false; 4870 vmx->nested.vmxon = true; 4871 4872 if (vmx_pt_mode_is_host_guest()) { 4873 vmx->pt_desc.guest.ctl = 0; 4874 pt_update_intercept_for_msr(vcpu); 4875 } 4876 4877 return 0; 4878 4879 out_shadow_vmcs: 4880 kfree(vmx->nested.cached_shadow_vmcs12); 4881 4882 out_cached_shadow_vmcs12: 4883 kfree(vmx->nested.cached_vmcs12); 4884 4885 out_cached_vmcs12: 4886 free_loaded_vmcs(&vmx->nested.vmcs02); 4887 4888 out_vmcs02: 4889 return -ENOMEM; 4890 } 4891 4892 /* Emulate the VMXON instruction. */ 4893 static int handle_vmon(struct kvm_vcpu *vcpu) 4894 { 4895 int ret; 4896 gpa_t vmptr; 4897 uint32_t revision; 4898 struct vcpu_vmx *vmx = to_vmx(vcpu); 4899 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 4900 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 4901 4902 /* 4903 * The Intel VMX Instruction Reference lists a bunch of bits that are 4904 * prerequisite to running VMXON, most notably cr4.VMXE must be set to 4905 * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this). 4906 * Otherwise, we should fail with #UD. But most faulting conditions 4907 * have already been checked by hardware, prior to the VM-exit for 4908 * VMXON. We do test guest cr4.VMXE because processor CR4 always has 4909 * that bit set to 1 in non-root mode. 4910 */ 4911 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { 4912 kvm_queue_exception(vcpu, UD_VECTOR); 4913 return 1; 4914 } 4915 4916 /* CPL=0 must be checked manually. 
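 * nested_vmx_check_permission() cannot be used for this, as it would
 * #UD before VMXON has been executed; hence the explicit vmx_get_cpl()
 * check below.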
*/ 4917 if (vmx_get_cpl(vcpu)) { 4918 kvm_inject_gp(vcpu, 0); 4919 return 1; 4920 } 4921 4922 if (vmx->nested.vmxon) 4923 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 4924 4925 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 4926 != VMXON_NEEDED_FEATURES) { 4927 kvm_inject_gp(vcpu, 0); 4928 return 1; 4929 } 4930 4931 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 4932 return ret; 4933 4934 /* 4935 * SDM 3: 24.11.5 4936 * The first 4 bytes of VMXON region contain the supported 4937 * VMCS revision identifier 4938 * 4939 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 4940 * which replaces physical address width with 32 4941 */ 4942 if (!page_address_valid(vcpu, vmptr)) 4943 return nested_vmx_failInvalid(vcpu); 4944 4945 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 4946 revision != VMCS12_REVISION) 4947 return nested_vmx_failInvalid(vcpu); 4948 4949 vmx->nested.vmxon_ptr = vmptr; 4950 ret = enter_vmx_operation(vcpu); 4951 if (ret) 4952 return ret; 4953 4954 return nested_vmx_succeed(vcpu); 4955 } 4956 4957 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 4958 { 4959 struct vcpu_vmx *vmx = to_vmx(vcpu); 4960 4961 if (vmx->nested.current_vmptr == INVALID_GPA) 4962 return; 4963 4964 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 4965 4966 if (enable_shadow_vmcs) { 4967 /* copy to memory all shadowed fields in case 4968 they were modified */ 4969 copy_shadow_to_vmcs12(vmx); 4970 vmx_disable_shadow_vmcs(vmx); 4971 } 4972 vmx->nested.posted_intr_nv = -1; 4973 4974 /* Flush VMCS12 to guest memory */ 4975 kvm_vcpu_write_guest_page(vcpu, 4976 vmx->nested.current_vmptr >> PAGE_SHIFT, 4977 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 4978 4979 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 4980 4981 vmx->nested.current_vmptr = INVALID_GPA; 4982 } 4983 4984 /* Emulate the VMXOFF instruction */ 4985 static int handle_vmoff(struct kvm_vcpu *vcpu) 4986 { 4987 if (!nested_vmx_check_permission(vcpu)) 4988 return 1; 4989 4990 free_nested(vcpu); 4991 4992 /* Process a latched INIT during time CPU was in VMX operation */ 4993 kvm_make_request(KVM_REQ_EVENT, vcpu); 4994 4995 return nested_vmx_succeed(vcpu); 4996 } 4997 4998 /* Emulate the VMCLEAR instruction */ 4999 static int handle_vmclear(struct kvm_vcpu *vcpu) 5000 { 5001 struct vcpu_vmx *vmx = to_vmx(vcpu); 5002 u32 zero = 0; 5003 gpa_t vmptr; 5004 u64 evmcs_gpa; 5005 int r; 5006 5007 if (!nested_vmx_check_permission(vcpu)) 5008 return 1; 5009 5010 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5011 return r; 5012 5013 if (!page_address_valid(vcpu, vmptr)) 5014 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5015 5016 if (vmptr == vmx->nested.vmxon_ptr) 5017 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5018 5019 /* 5020 * When Enlightened VMEntry is enabled on the calling CPU we treat 5021 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 5022 * way to distinguish it from VMCS12) and we must not corrupt it by 5023 * writing to the non-existent 'launch_state' field. The area doesn't 5024 * have to be the currently active EVMCS on the calling CPU and there's 5025 * nothing KVM has to do to transition it from 'active' to 'non-active' 5026 * state. It is possible that the area will stay mapped as 5027 * vmx->nested.hv_evmcs but this shouldn't be a problem. 
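 *
 * Hence the handling below: a regular VMCS12 has its launch_state
 * cleared in guest memory (after being released if it is the current
 * VMCS), whereas a vmptr matching the current eVMCS only triggers
 * nested_release_evmcs().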
5028 */ 5029 if (likely(!vmx->nested.enlightened_vmcs_enabled || 5030 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 5031 if (vmptr == vmx->nested.current_vmptr) 5032 nested_release_vmcs12(vcpu); 5033 5034 kvm_vcpu_write_guest(vcpu, 5035 vmptr + offsetof(struct vmcs12, 5036 launch_state), 5037 &zero, sizeof(zero)); 5038 } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { 5039 nested_release_evmcs(vcpu); 5040 } 5041 5042 return nested_vmx_succeed(vcpu); 5043 } 5044 5045 /* Emulate the VMLAUNCH instruction */ 5046 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5047 { 5048 return nested_vmx_run(vcpu, true); 5049 } 5050 5051 /* Emulate the VMRESUME instruction */ 5052 static int handle_vmresume(struct kvm_vcpu *vcpu) 5053 { 5054 5055 return nested_vmx_run(vcpu, false); 5056 } 5057 5058 static int handle_vmread(struct kvm_vcpu *vcpu) 5059 { 5060 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5061 : get_vmcs12(vcpu); 5062 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5063 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5064 struct vcpu_vmx *vmx = to_vmx(vcpu); 5065 struct x86_exception e; 5066 unsigned long field; 5067 u64 value; 5068 gva_t gva = 0; 5069 short offset; 5070 int len, r; 5071 5072 if (!nested_vmx_check_permission(vcpu)) 5073 return 1; 5074 5075 /* 5076 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5077 * any VMREAD sets the ALU flags for VMfailInvalid. 5078 */ 5079 if (vmx->nested.current_vmptr == INVALID_GPA || 5080 (is_guest_mode(vcpu) && 5081 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5082 return nested_vmx_failInvalid(vcpu); 5083 5084 /* Decode instruction info and find the field to read */ 5085 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5086 5087 offset = vmcs_field_to_offset(field); 5088 if (offset < 0) 5089 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5090 5091 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5092 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5093 5094 /* Read the field, zero-extended to a u64 value */ 5095 value = vmcs12_read_any(vmcs12, field, offset); 5096 5097 /* 5098 * Now copy part of this value to register or memory, as requested. 5099 * Note that the number of bits actually copied is 32 or 64 depending 5100 * on the guest's mode (32 or 64 bit), not on the given field's length. 5101 */ 5102 if (instr_info & BIT(10)) { 5103 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5104 } else { 5105 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5106 if (get_vmx_mem_address(vcpu, exit_qualification, 5107 instr_info, true, len, &gva)) 5108 return 1; 5109 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5110 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5111 if (r != X86EMUL_CONTINUE) 5112 return kvm_handle_memory_failure(vcpu, r, &e); 5113 } 5114 5115 return nested_vmx_succeed(vcpu); 5116 } 5117 5118 static bool is_shadow_field_rw(unsigned long field) 5119 { 5120 switch (field) { 5121 #define SHADOW_FIELD_RW(x, y) case x: 5122 #include "vmcs_shadow_fields.h" 5123 return true; 5124 default: 5125 break; 5126 } 5127 return false; 5128 } 5129 5130 static bool is_shadow_field_ro(unsigned long field) 5131 { 5132 switch (field) { 5133 #define SHADOW_FIELD_RO(x, y) case x: 5134 #include "vmcs_shadow_fields.h" 5135 return true; 5136 default: 5137 break; 5138 } 5139 return false; 5140 } 5141 5142 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5143 { 5144 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5145 : get_vmcs12(vcpu); 5146 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5147 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5148 struct vcpu_vmx *vmx = to_vmx(vcpu); 5149 struct x86_exception e; 5150 unsigned long field; 5151 short offset; 5152 gva_t gva; 5153 int len, r; 5154 5155 /* 5156 * The value to write might be 32 or 64 bits, depending on L1's long 5157 * mode, and eventually we need to write that into a field of several 5158 * possible lengths. The code below first zero-extends the value to 64 5159 * bit (value), and then copies only the appropriate number of 5160 * bits into the vmcs12 field. 5161 */ 5162 u64 value = 0; 5163 5164 if (!nested_vmx_check_permission(vcpu)) 5165 return 1; 5166 5167 /* 5168 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5169 * any VMWRITE sets the ALU flags for VMfailInvalid. 5170 */ 5171 if (vmx->nested.current_vmptr == INVALID_GPA || 5172 (is_guest_mode(vcpu) && 5173 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5174 return nested_vmx_failInvalid(vcpu); 5175 5176 if (instr_info & BIT(10)) 5177 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5178 else { 5179 len = is_64_bit_mode(vcpu) ? 8 : 4; 5180 if (get_vmx_mem_address(vcpu, exit_qualification, 5181 instr_info, false, len, &gva)) 5182 return 1; 5183 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5184 if (r != X86EMUL_CONTINUE) 5185 return kvm_handle_memory_failure(vcpu, r, &e); 5186 } 5187 5188 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5189 5190 offset = vmcs_field_to_offset(field); 5191 if (offset < 0) 5192 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5193 5194 /* 5195 * If the vCPU supports "VMWRITE to any supported field in the 5196 * VMCS," then the "read-only" fields are actually read/write. 5197 */ 5198 if (vmcs_field_readonly(field) && 5199 !nested_cpu_has_vmwrite_any_field(vcpu)) 5200 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5201 5202 /* 5203 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5204 * vmcs12, else we may crush a field or consume a stale value. 5205 */ 5206 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5207 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5208 5209 /* 5210 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5211 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5212 * behavior regardless of the underlying hardware, e.g. 
if an AR_BYTE 5213 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5214 * from L1 will return a different value than VMREAD from L2 (L1 sees 5215 * the stripped down value, L2 sees the full value as stored by KVM). 5216 */ 5217 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5218 value &= 0x1f0ff; 5219 5220 vmcs12_write_any(vmcs12, field, offset, value); 5221 5222 /* 5223 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5224 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5225 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5226 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5227 */ 5228 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5229 /* 5230 * L1 can read these fields without exiting, ensure the 5231 * shadow VMCS is up-to-date. 5232 */ 5233 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5234 preempt_disable(); 5235 vmcs_load(vmx->vmcs01.shadow_vmcs); 5236 5237 __vmcs_writel(field, value); 5238 5239 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5240 vmcs_load(vmx->loaded_vmcs->vmcs); 5241 preempt_enable(); 5242 } 5243 vmx->nested.dirty_vmcs12 = true; 5244 } 5245 5246 return nested_vmx_succeed(vcpu); 5247 } 5248 5249 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5250 { 5251 vmx->nested.current_vmptr = vmptr; 5252 if (enable_shadow_vmcs) { 5253 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5254 vmcs_write64(VMCS_LINK_POINTER, 5255 __pa(vmx->vmcs01.shadow_vmcs)); 5256 vmx->nested.need_vmcs12_to_shadow_sync = true; 5257 } 5258 vmx->nested.dirty_vmcs12 = true; 5259 } 5260 5261 /* Emulate the VMPTRLD instruction */ 5262 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5263 { 5264 struct vcpu_vmx *vmx = to_vmx(vcpu); 5265 gpa_t vmptr; 5266 int r; 5267 5268 if (!nested_vmx_check_permission(vcpu)) 5269 return 1; 5270 5271 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5272 return r; 5273 5274 if (!page_address_valid(vcpu, vmptr)) 5275 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5276 5277 if (vmptr == vmx->nested.vmxon_ptr) 5278 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5279 5280 /* Forbid normal VMPTRLD if Enlightened version was used */ 5281 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 5282 return 1; 5283 5284 if (vmx->nested.current_vmptr != vmptr) { 5285 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5286 struct vmcs_hdr hdr; 5287 5288 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5289 /* 5290 * Reads from an unbacked page return all 1s, 5291 * which means that the 32 bits located at the 5292 * given physical address won't match the required 5293 * VMCS12_REVISION identifier. 5294 */ 5295 return nested_vmx_fail(vcpu, 5296 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5297 } 5298 5299 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5300 offsetof(struct vmcs12, hdr), 5301 sizeof(hdr))) { 5302 return nested_vmx_fail(vcpu, 5303 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5304 } 5305 5306 if (hdr.revision_id != VMCS12_REVISION || 5307 (hdr.shadow_vmcs && 5308 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5309 return nested_vmx_fail(vcpu, 5310 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5311 } 5312 5313 nested_release_vmcs12(vcpu); 5314 5315 /* 5316 * Load VMCS12 from guest memory since it is not already 5317 * cached. 
5318 */ 5319 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5320 VMCS12_SIZE)) { 5321 return nested_vmx_fail(vcpu, 5322 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5323 } 5324 5325 set_current_vmptr(vmx, vmptr); 5326 } 5327 5328 return nested_vmx_succeed(vcpu); 5329 } 5330 5331 /* Emulate the VMPTRST instruction */ 5332 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5333 { 5334 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5335 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5336 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5337 struct x86_exception e; 5338 gva_t gva; 5339 int r; 5340 5341 if (!nested_vmx_check_permission(vcpu)) 5342 return 1; 5343 5344 if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) 5345 return 1; 5346 5347 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5348 true, sizeof(gpa_t), &gva)) 5349 return 1; 5350 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5351 r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, 5352 sizeof(gpa_t), &e); 5353 if (r != X86EMUL_CONTINUE) 5354 return kvm_handle_memory_failure(vcpu, r, &e); 5355 5356 return nested_vmx_succeed(vcpu); 5357 } 5358 5359 /* Emulate the INVEPT instruction */ 5360 static int handle_invept(struct kvm_vcpu *vcpu) 5361 { 5362 struct vcpu_vmx *vmx = to_vmx(vcpu); 5363 u32 vmx_instruction_info, types; 5364 unsigned long type, roots_to_free; 5365 struct kvm_mmu *mmu; 5366 gva_t gva; 5367 struct x86_exception e; 5368 struct { 5369 u64 eptp, gpa; 5370 } operand; 5371 int i, r, gpr_index; 5372 5373 if (!(vmx->nested.msrs.secondary_ctls_high & 5374 SECONDARY_EXEC_ENABLE_EPT) || 5375 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5376 kvm_queue_exception(vcpu, UD_VECTOR); 5377 return 1; 5378 } 5379 5380 if (!nested_vmx_check_permission(vcpu)) 5381 return 1; 5382 5383 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5384 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5385 type = kvm_register_read(vcpu, gpr_index); 5386 5387 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5388 5389 if (type >= 32 || !(types & (1 << type))) 5390 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5391 5392 /* According to the Intel VMX instruction reference, the memory 5393 * operand is read even if it isn't needed (e.g., for type==global) 5394 */ 5395 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5396 vmx_instruction_info, false, sizeof(operand), &gva)) 5397 return 1; 5398 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5399 if (r != X86EMUL_CONTINUE) 5400 return kvm_handle_memory_failure(vcpu, r, &e); 5401 5402 /* 5403 * Nested EPT roots are always held through guest_mmu, 5404 * not root_mmu.
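* (Roughly: root_mmu covers L1's own paging, and L2's when L1 does not use EPT, while the shadow EPT tables KVM builds for L1's EPT tables are tracked in guest_mmu, which is why INVEPT frees roots there.)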
5405 */ 5406 mmu = &vcpu->arch.guest_mmu; 5407 5408 switch (type) { 5409 case VMX_EPT_EXTENT_CONTEXT: 5410 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5411 return nested_vmx_fail(vcpu, 5412 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5413 5414 roots_to_free = 0; 5415 if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd, 5416 operand.eptp)) 5417 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5418 5419 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5420 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5421 mmu->prev_roots[i].pgd, 5422 operand.eptp)) 5423 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5424 } 5425 break; 5426 case VMX_EPT_EXTENT_GLOBAL: 5427 roots_to_free = KVM_MMU_ROOTS_ALL; 5428 break; 5429 default: 5430 BUG(); 5431 break; 5432 } 5433 5434 if (roots_to_free) 5435 kvm_mmu_free_roots(vcpu, mmu, roots_to_free); 5436 5437 return nested_vmx_succeed(vcpu); 5438 } 5439 5440 static int handle_invvpid(struct kvm_vcpu *vcpu) 5441 { 5442 struct vcpu_vmx *vmx = to_vmx(vcpu); 5443 u32 vmx_instruction_info; 5444 unsigned long type, types; 5445 gva_t gva; 5446 struct x86_exception e; 5447 struct { 5448 u64 vpid; 5449 u64 gla; 5450 } operand; 5451 u16 vpid02; 5452 int r, gpr_index; 5453 5454 if (!(vmx->nested.msrs.secondary_ctls_high & 5455 SECONDARY_EXEC_ENABLE_VPID) || 5456 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5457 kvm_queue_exception(vcpu, UD_VECTOR); 5458 return 1; 5459 } 5460 5461 if (!nested_vmx_check_permission(vcpu)) 5462 return 1; 5463 5464 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5465 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5466 type = kvm_register_read(vcpu, gpr_index); 5467 5468 types = (vmx->nested.msrs.vpid_caps & 5469 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5470 5471 if (type >= 32 || !(types & (1 << type))) 5472 return nested_vmx_fail(vcpu, 5473 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5474 5475 /* according to the intel vmx instruction reference, the memory 5476 * operand is read even if it isn't needed (e.g., for type==global) 5477 */ 5478 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5479 vmx_instruction_info, false, sizeof(operand), &gva)) 5480 return 1; 5481 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5482 if (r != X86EMUL_CONTINUE) 5483 return kvm_handle_memory_failure(vcpu, r, &e); 5484 5485 if (operand.vpid >> 16) 5486 return nested_vmx_fail(vcpu, 5487 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5488 5489 vpid02 = nested_get_vpid02(vcpu); 5490 switch (type) { 5491 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5492 if (!operand.vpid || 5493 is_noncanonical_address(operand.gla, vcpu)) 5494 return nested_vmx_fail(vcpu, 5495 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5496 vpid_sync_vcpu_addr(vpid02, operand.gla); 5497 break; 5498 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5499 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5500 if (!operand.vpid) 5501 return nested_vmx_fail(vcpu, 5502 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5503 vpid_sync_context(vpid02); 5504 break; 5505 case VMX_VPID_EXTENT_ALL_CONTEXT: 5506 vpid_sync_context(vpid02); 5507 break; 5508 default: 5509 WARN_ON_ONCE(1); 5510 return kvm_skip_emulated_instruction(vcpu); 5511 } 5512 5513 /* 5514 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5515 * linear mappings for L2 (tagged with L2's VPID). Free all guest 5516 * roots as VPIDs are not tracked in the MMU role. 5517 * 5518 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5519 * an MMU when EPT is disabled. 
5520 * 5521 * TODO: sync only the affected SPTEs for INDIVIDUAL_ADDR. 5522 */ 5523 if (!enable_ept) 5524 kvm_mmu_free_guest_mode_roots(vcpu, &vcpu->arch.root_mmu); 5525 5526 return nested_vmx_succeed(vcpu); 5527 } 5528 5529 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5530 struct vmcs12 *vmcs12) 5531 { 5532 u32 index = kvm_rcx_read(vcpu); 5533 u64 new_eptp; 5534 5535 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 5536 return 1; 5537 if (index >= VMFUNC_EPTP_ENTRIES) 5538 return 1; 5539 5540 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5541 &new_eptp, index * 8, 8)) 5542 return 1; 5543 5544 /* 5545 * If the (L2) guest does a vmfunc to the currently 5546 * active ept pointer, we don't have to do anything else 5547 */ 5548 if (vmcs12->ept_pointer != new_eptp) { 5549 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5550 return 1; 5551 5552 vmcs12->ept_pointer = new_eptp; 5553 nested_ept_new_eptp(vcpu); 5554 5555 if (!nested_cpu_has_vpid(vmcs12)) 5556 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 5557 } 5558 5559 return 0; 5560 } 5561 5562 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5563 { 5564 struct vcpu_vmx *vmx = to_vmx(vcpu); 5565 struct vmcs12 *vmcs12; 5566 u32 function = kvm_rax_read(vcpu); 5567 5568 /* 5569 * VMFUNC is only supported for nested guests, but we always enable the 5570 * secondary control for simplicity; for non-nested mode, fake that we 5571 * didn't enable it by injecting #UD. 5572 */ 5573 if (!is_guest_mode(vcpu)) { 5574 kvm_queue_exception(vcpu, UD_VECTOR); 5575 return 1; 5576 } 5577 5578 vmcs12 = get_vmcs12(vcpu); 5579 5580 /* 5581 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 5582 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 5583 */ 5584 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 5585 kvm_queue_exception(vcpu, UD_VECTOR); 5586 return 1; 5587 } 5588 5589 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 5590 goto fail; 5591 5592 switch (function) { 5593 case 0: 5594 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5595 goto fail; 5596 break; 5597 default: 5598 goto fail; 5599 } 5600 return kvm_skip_emulated_instruction(vcpu); 5601 5602 fail: 5603 /* 5604 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5605 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5606 * EXIT_REASON_VMFUNC as the exit reason. 5607 */ 5608 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5609 vmx_get_intr_info(vcpu), 5610 vmx_get_exit_qual(vcpu)); 5611 return 1; 5612 } 5613 5614 /* 5615 * Return true if an IO instruction with the specified port and size should cause 5616 * a VM-exit into L1.
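* Illustrative example: a one-byte access to port 0x3f8 is looked up in io_bitmap_a at byte (0x3f8 & 0x7fff) / 8 = 127, bit 0x3f8 & 7 = 0; ports 0x8000 and above use io_bitmap_b instead. If the relevant bit is set, or the bitmap cannot be read, the access is reported as causing a VM-exit.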
5617 */ 5618 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5619 int size) 5620 { 5621 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5622 gpa_t bitmap, last_bitmap; 5623 u8 b; 5624 5625 last_bitmap = INVALID_GPA; 5626 b = -1; 5627 5628 while (size > 0) { 5629 if (port < 0x8000) 5630 bitmap = vmcs12->io_bitmap_a; 5631 else if (port < 0x10000) 5632 bitmap = vmcs12->io_bitmap_b; 5633 else 5634 return true; 5635 bitmap += (port & 0x7fff) / 8; 5636 5637 if (last_bitmap != bitmap) 5638 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5639 return true; 5640 if (b & (1 << (port & 7))) 5641 return true; 5642 5643 port++; 5644 size--; 5645 last_bitmap = bitmap; 5646 } 5647 5648 return false; 5649 } 5650 5651 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5652 struct vmcs12 *vmcs12) 5653 { 5654 unsigned long exit_qualification; 5655 unsigned short port; 5656 int size; 5657 5658 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5659 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 5660 5661 exit_qualification = vmx_get_exit_qual(vcpu); 5662 5663 port = exit_qualification >> 16; 5664 size = (exit_qualification & 7) + 1; 5665 5666 return nested_vmx_check_io_bitmaps(vcpu, port, size); 5667 } 5668 5669 /* 5670 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 5671 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 5672 * disinterest in the current event (read or write a specific MSR) by using an 5673 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 5674 */ 5675 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 5676 struct vmcs12 *vmcs12, 5677 union vmx_exit_reason exit_reason) 5678 { 5679 u32 msr_index = kvm_rcx_read(vcpu); 5680 gpa_t bitmap; 5681 5682 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 5683 return true; 5684 5685 /* 5686 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 5687 * for the four combinations of read/write and low/high MSR numbers. 5688 * First we need to figure out which of the four to use: 5689 */ 5690 bitmap = vmcs12->msr_bitmap; 5691 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 5692 bitmap += 2048; 5693 if (msr_index >= 0xc0000000) { 5694 msr_index -= 0xc0000000; 5695 bitmap += 1024; 5696 } 5697 5698 /* Then read the msr_index'th bit from this bitmap: */ 5699 if (msr_index < 1024*8) { 5700 unsigned char b; 5701 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 5702 return true; 5703 return 1 & (b >> (msr_index & 7)); 5704 } else 5705 return true; /* let L1 handle the wrong parameter */ 5706 } 5707 5708 /* 5709 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 5710 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 5711 * intercept (via guest_host_mask etc.) the current event. 
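* Illustrative example: if L1 owns CR0.TS (cr0_guest_host_mask has X86_CR0_TS set) while cr0_read_shadow has TS clear, a MOV to CR0 from L2 that sets TS changes an L1-owned bit relative to the shadow and is reflected to L1, whereas a write that leaves TS clear is handled without involving L1.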
5712 */ 5713 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 5714 struct vmcs12 *vmcs12) 5715 { 5716 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5717 int cr = exit_qualification & 15; 5718 int reg; 5719 unsigned long val; 5720 5721 switch ((exit_qualification >> 4) & 3) { 5722 case 0: /* mov to cr */ 5723 reg = (exit_qualification >> 8) & 15; 5724 val = kvm_register_read(vcpu, reg); 5725 switch (cr) { 5726 case 0: 5727 if (vmcs12->cr0_guest_host_mask & 5728 (val ^ vmcs12->cr0_read_shadow)) 5729 return true; 5730 break; 5731 case 3: 5732 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 5733 return true; 5734 break; 5735 case 4: 5736 if (vmcs12->cr4_guest_host_mask & 5737 (vmcs12->cr4_read_shadow ^ val)) 5738 return true; 5739 break; 5740 case 8: 5741 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 5742 return true; 5743 break; 5744 } 5745 break; 5746 case 2: /* clts */ 5747 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 5748 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 5749 return true; 5750 break; 5751 case 1: /* mov from cr */ 5752 switch (cr) { 5753 case 3: 5754 if (vmcs12->cpu_based_vm_exec_control & 5755 CPU_BASED_CR3_STORE_EXITING) 5756 return true; 5757 break; 5758 case 8: 5759 if (vmcs12->cpu_based_vm_exec_control & 5760 CPU_BASED_CR8_STORE_EXITING) 5761 return true; 5762 break; 5763 } 5764 break; 5765 case 3: /* lmsw */ 5766 /* 5767 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 5768 * cr0. Other attempted changes are ignored, with no exit. 5769 */ 5770 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5771 if (vmcs12->cr0_guest_host_mask & 0xe & 5772 (val ^ vmcs12->cr0_read_shadow)) 5773 return true; 5774 if ((vmcs12->cr0_guest_host_mask & 0x1) && 5775 !(vmcs12->cr0_read_shadow & 0x1) && 5776 (val & 0x1)) 5777 return true; 5778 break; 5779 } 5780 return false; 5781 } 5782 5783 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 5784 struct vmcs12 *vmcs12) 5785 { 5786 u32 encls_leaf; 5787 5788 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 5789 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 5790 return false; 5791 5792 encls_leaf = kvm_rax_read(vcpu); 5793 if (encls_leaf > 62) 5794 encls_leaf = 63; 5795 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 5796 } 5797 5798 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 5799 struct vmcs12 *vmcs12, gpa_t bitmap) 5800 { 5801 u32 vmx_instruction_info; 5802 unsigned long field; 5803 u8 b; 5804 5805 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 5806 return true; 5807 5808 /* Decode instruction info and find the field to access */ 5809 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5810 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 5811 5812 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 5813 if (field >> 15) 5814 return true; 5815 5816 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 5817 return true; 5818 5819 return 1 & (b >> (field & 7)); 5820 } 5821 5822 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 5823 { 5824 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 5825 5826 if (nested_cpu_has_mtf(vmcs12)) 5827 return true; 5828 5829 /* 5830 * An MTF VM-exit may be injected into the guest by setting the 5831 * interruption-type to 7 (other event) and the vector field to 0. Such 5832 * is the case regardless of the 'monitor trap flag' VM-execution 5833 * control. 
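* With the usual encodings (valid bit 31, interruption type in bits 10:8, vector in bits 7:0), such an injected MTF corresponds to an interruption-information value of INTR_INFO_VALID_MASK | INTR_TYPE_OTHER_EVENT, i.e. 0x80000000 | (7 << 8) = 0x80000700, which is exactly what the comparison below tests.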
5834 */ 5835 return entry_intr_info == (INTR_INFO_VALID_MASK 5836 | INTR_TYPE_OTHER_EVENT); 5837 } 5838 5839 /* 5840 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 5841 * L1 wants the exit. Only call this when in is_guest_mode (L2). 5842 */ 5843 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 5844 union vmx_exit_reason exit_reason) 5845 { 5846 u32 intr_info; 5847 5848 switch ((u16)exit_reason.basic) { 5849 case EXIT_REASON_EXCEPTION_NMI: 5850 intr_info = vmx_get_intr_info(vcpu); 5851 if (is_nmi(intr_info)) 5852 return true; 5853 else if (is_page_fault(intr_info)) 5854 return vcpu->arch.apf.host_apf_flags || 5855 vmx_need_pf_intercept(vcpu); 5856 else if (is_debug(intr_info) && 5857 vcpu->guest_debug & 5858 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5859 return true; 5860 else if (is_breakpoint(intr_info) && 5861 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5862 return true; 5863 else if (is_alignment_check(intr_info) && 5864 !vmx_guest_inject_ac(vcpu)) 5865 return true; 5866 return false; 5867 case EXIT_REASON_EXTERNAL_INTERRUPT: 5868 return true; 5869 case EXIT_REASON_MCE_DURING_VMENTRY: 5870 return true; 5871 case EXIT_REASON_EPT_VIOLATION: 5872 /* 5873 * L0 always deals with the EPT violation. If nested EPT is 5874 * used, and the nested mmu code discovers that the address is 5875 * missing in the guest EPT table (EPT12), the EPT violation 5876 * will be injected with nested_ept_inject_page_fault() 5877 */ 5878 return true; 5879 case EXIT_REASON_EPT_MISCONFIG: 5880 /* 5881 * L2 never directly uses L1's EPT, but rather L0's own EPT 5882 * table (shadow on EPT) or a merged EPT table that L0 built 5883 * (EPT on EPT). So any problems with the structure of the 5884 * table are L0's fault. 5885 */ 5886 return true; 5887 case EXIT_REASON_PREEMPTION_TIMER: 5888 return true; 5889 case EXIT_REASON_PML_FULL: 5890 /* 5891 * PML is emulated for an L1 VMM and should never be enabled in 5892 * vmcs02, always "handle" PML_FULL by exiting to userspace. 5893 */ 5894 return true; 5895 case EXIT_REASON_VMFUNC: 5896 /* VM functions are emulated through L2->L0 vmexits. */ 5897 return true; 5898 case EXIT_REASON_BUS_LOCK: 5899 /* 5900 * At present, bus lock VM exit is never exposed to L1. 5901 * Handle L2's bus locks in L0 directly. 5902 */ 5903 return true; 5904 default: 5905 break; 5906 } 5907 return false; 5908 } 5909 5910 /* 5911 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 5912 * is_guest_mode (L2).
5913 */ 5914 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 5915 union vmx_exit_reason exit_reason) 5916 { 5917 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5918 u32 intr_info; 5919 5920 switch ((u16)exit_reason.basic) { 5921 case EXIT_REASON_EXCEPTION_NMI: 5922 intr_info = vmx_get_intr_info(vcpu); 5923 if (is_nmi(intr_info)) 5924 return true; 5925 else if (is_page_fault(intr_info)) 5926 return true; 5927 return vmcs12->exception_bitmap & 5928 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 5929 case EXIT_REASON_EXTERNAL_INTERRUPT: 5930 return nested_exit_on_intr(vcpu); 5931 case EXIT_REASON_TRIPLE_FAULT: 5932 return true; 5933 case EXIT_REASON_INTERRUPT_WINDOW: 5934 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 5935 case EXIT_REASON_NMI_WINDOW: 5936 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 5937 case EXIT_REASON_TASK_SWITCH: 5938 return true; 5939 case EXIT_REASON_CPUID: 5940 return true; 5941 case EXIT_REASON_HLT: 5942 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 5943 case EXIT_REASON_INVD: 5944 return true; 5945 case EXIT_REASON_INVLPG: 5946 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5947 case EXIT_REASON_RDPMC: 5948 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 5949 case EXIT_REASON_RDRAND: 5950 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 5951 case EXIT_REASON_RDSEED: 5952 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 5953 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 5954 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 5955 case EXIT_REASON_VMREAD: 5956 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5957 vmcs12->vmread_bitmap); 5958 case EXIT_REASON_VMWRITE: 5959 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5960 vmcs12->vmwrite_bitmap); 5961 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 5962 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 5963 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 5964 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 5965 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 5966 /* 5967 * VMX instructions trap unconditionally. This allows L1 to 5968 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
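* (For example, if L2 is itself a hypervisor, its VMLAUNCH exits to L0 and is reflected here to L1, which then emulates it for its own nested guest just as L0 emulates L1's VMX instructions.)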
5969 */ 5970 return true; 5971 case EXIT_REASON_CR_ACCESS: 5972 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 5973 case EXIT_REASON_DR_ACCESS: 5974 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 5975 case EXIT_REASON_IO_INSTRUCTION: 5976 return nested_vmx_exit_handled_io(vcpu, vmcs12); 5977 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 5978 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 5979 case EXIT_REASON_MSR_READ: 5980 case EXIT_REASON_MSR_WRITE: 5981 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 5982 case EXIT_REASON_INVALID_STATE: 5983 return true; 5984 case EXIT_REASON_MWAIT_INSTRUCTION: 5985 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 5986 case EXIT_REASON_MONITOR_TRAP_FLAG: 5987 return nested_vmx_exit_handled_mtf(vmcs12); 5988 case EXIT_REASON_MONITOR_INSTRUCTION: 5989 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 5990 case EXIT_REASON_PAUSE_INSTRUCTION: 5991 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 5992 nested_cpu_has2(vmcs12, 5993 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 5994 case EXIT_REASON_MCE_DURING_VMENTRY: 5995 return true; 5996 case EXIT_REASON_TPR_BELOW_THRESHOLD: 5997 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 5998 case EXIT_REASON_APIC_ACCESS: 5999 case EXIT_REASON_APIC_WRITE: 6000 case EXIT_REASON_EOI_INDUCED: 6001 /* 6002 * The controls for "virtualize APIC accesses," "APIC- 6003 * register virtualization," and "virtual-interrupt 6004 * delivery" only come from vmcs12. 6005 */ 6006 return true; 6007 case EXIT_REASON_INVPCID: 6008 return 6009 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6010 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6011 case EXIT_REASON_WBINVD: 6012 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6013 case EXIT_REASON_XSETBV: 6014 return true; 6015 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6016 /* 6017 * This should never happen, since it is not possible to 6018 * set XSS to a non-zero value---neither in L1 nor in L2. 6019 * If it were, XSS would have to be checked against 6020 * the XSS exit bitmap in vmcs12. 6021 */ 6022 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 6023 case EXIT_REASON_UMWAIT: 6024 case EXIT_REASON_TPAUSE: 6025 return nested_cpu_has2(vmcs12, 6026 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6027 case EXIT_REASON_ENCLS: 6028 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6029 default: 6030 return true; 6031 } 6032 } 6033 6034 /* 6035 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6036 * reflected into L1. 6037 */ 6038 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6039 { 6040 struct vcpu_vmx *vmx = to_vmx(vcpu); 6041 union vmx_exit_reason exit_reason = vmx->exit_reason; 6042 unsigned long exit_qual; 6043 u32 exit_intr_info; 6044 6045 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6046 6047 /* 6048 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6049 * has already loaded L2's state. 6050 */ 6051 if (unlikely(vmx->fail)) { 6052 trace_kvm_nested_vmenter_failed( 6053 "hardware VM-instruction error: ", 6054 vmcs_read32(VM_INSTRUCTION_ERROR)); 6055 exit_intr_info = 0; 6056 exit_qual = 0; 6057 goto reflect_vmexit; 6058 } 6059 6060 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6061 6062 /* If L0 (KVM) wants the exit, it trumps L1's desires. */ 6063 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6064 return false; 6065 6066 /* If L1 doesn't want the exit, handle it in L0.
*/ 6067 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6068 return false; 6069 6070 /* 6071 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6072 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6073 * need to be synthesized by querying the in-kernel LAPIC, but external 6074 * interrupts are never reflected to L1 so it's a non-issue. 6075 */ 6076 exit_intr_info = vmx_get_intr_info(vcpu); 6077 if (is_exception_with_error_code(exit_intr_info)) { 6078 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6079 6080 vmcs12->vm_exit_intr_error_code = 6081 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6082 } 6083 exit_qual = vmx_get_exit_qual(vcpu); 6084 6085 reflect_vmexit: 6086 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6087 return true; 6088 } 6089 6090 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6091 struct kvm_nested_state __user *user_kvm_nested_state, 6092 u32 user_data_size) 6093 { 6094 struct vcpu_vmx *vmx; 6095 struct vmcs12 *vmcs12; 6096 struct kvm_nested_state kvm_state = { 6097 .flags = 0, 6098 .format = KVM_STATE_NESTED_FORMAT_VMX, 6099 .size = sizeof(kvm_state), 6100 .hdr.vmx.flags = 0, 6101 .hdr.vmx.vmxon_pa = INVALID_GPA, 6102 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6103 .hdr.vmx.preemption_timer_deadline = 0, 6104 }; 6105 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6106 &user_kvm_nested_state->data.vmx[0]; 6107 6108 if (!vcpu) 6109 return kvm_state.size + sizeof(*user_vmx_nested_state); 6110 6111 vmx = to_vmx(vcpu); 6112 vmcs12 = get_vmcs12(vcpu); 6113 6114 if (nested_vmx_allowed(vcpu) && 6115 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6116 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6117 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6118 6119 if (vmx_has_valid_vmcs12(vcpu)) { 6120 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6121 6122 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6123 if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) 6124 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6125 6126 if (is_guest_mode(vcpu) && 6127 nested_cpu_has_shadow_vmcs(vmcs12) && 6128 vmcs12->vmcs_link_pointer != INVALID_GPA) 6129 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6130 } 6131 6132 if (vmx->nested.smm.vmxon) 6133 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6134 6135 if (vmx->nested.smm.guest_mode) 6136 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6137 6138 if (is_guest_mode(vcpu)) { 6139 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6140 6141 if (vmx->nested.nested_run_pending) 6142 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6143 6144 if (vmx->nested.mtf_pending) 6145 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6146 6147 if (nested_cpu_has_preemption_timer(vmcs12) && 6148 vmx->nested.has_preemption_timer_deadline) { 6149 kvm_state.hdr.vmx.flags |= 6150 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6151 kvm_state.hdr.vmx.preemption_timer_deadline = 6152 vmx->nested.preemption_timer_deadline; 6153 } 6154 } 6155 } 6156 6157 if (user_data_size < kvm_state.size) 6158 goto out; 6159 6160 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6161 return -EFAULT; 6162 6163 if (!vmx_has_valid_vmcs12(vcpu)) 6164 goto out; 6165 6166 /* 6167 * When running L2, the authoritative vmcs12 state is in the 6168 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6169 * in the shadow or enlightened vmcs linked to vmcs01, unless 6170 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6171 * vmcs12 state is in the vmcs12 already. 6172 */ 6173 if (is_guest_mode(vcpu)) { 6174 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6175 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6176 } else { 6177 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6178 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6179 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 6180 /* 6181 * The L1 hypervisor is not obliged to keep the eVMCS 6182 * clean-fields data up-to-date while it is 6183 * not in guest mode; 'hv_clean_fields' is only 6184 * supposed to be accurate at vmentry, so we need 6185 * to ignore it here and do a full copy. 6186 */ 6187 copy_enlightened_to_vmcs12(vmx, 0); 6188 else if (enable_shadow_vmcs) 6189 copy_shadow_to_vmcs12(vmx); 6190 } 6191 } 6192 6193 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6194 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6195 6196 /* 6197 * Copy over the full allocated size of vmcs12 rather than just the size 6198 * of the struct. 6199 */ 6200 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6201 return -EFAULT; 6202 6203 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6204 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6205 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6206 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6207 return -EFAULT; 6208 } 6209 out: 6210 return kvm_state.size; 6211 } 6212 6213 /* 6214 * Forcibly leave nested mode in order to be able to reset the VCPU later on. 6215 */ 6216 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6217 { 6218 if (is_guest_mode(vcpu)) { 6219 to_vmx(vcpu)->nested.nested_run_pending = 0; 6220 nested_vmx_vmexit(vcpu, -1, 0, 0); 6221 } 6222 free_nested(vcpu); 6223 } 6224 6225 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6226 struct kvm_nested_state __user *user_kvm_nested_state, 6227 struct kvm_nested_state *kvm_state) 6228 { 6229 struct vcpu_vmx *vmx = to_vmx(vcpu); 6230 struct vmcs12 *vmcs12; 6231 enum vm_entry_failure_code ignored; 6232 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6233 &user_kvm_nested_state->data.vmx[0]; 6234 int ret; 6235 6236 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6237 return -EINVAL; 6238 6239 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6240 if (kvm_state->hdr.vmx.smm.flags) 6241 return -EINVAL; 6242 6243 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6244 return -EINVAL; 6245 6246 /* 6247 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6248 * enable the eVMCS capability on the vCPU. However, the code 6249 * has since been changed such that the flag signals that vmcs12 6250 * should be copied into the eVMCS in guest memory. 6251 * 6252 * To preserve backwards compatibility, allow the user 6253 * to set this flag even when there is no VMXON region.
6254 */ 6255 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6256 return -EINVAL; 6257 } else { 6258 if (!nested_vmx_allowed(vcpu)) 6259 return -EINVAL; 6260 6261 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6262 return -EINVAL; 6263 } 6264 6265 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6266 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6267 return -EINVAL; 6268 6269 if (kvm_state->hdr.vmx.smm.flags & 6270 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6271 return -EINVAL; 6272 6273 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6274 return -EINVAL; 6275 6276 /* 6277 * SMM temporarily disables VMX, so we cannot be in guest mode, 6278 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6279 * must be zero. 6280 */ 6281 if (is_smm(vcpu) ? 6282 (kvm_state->flags & 6283 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6284 : kvm_state->hdr.vmx.smm.flags) 6285 return -EINVAL; 6286 6287 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6288 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6289 return -EINVAL; 6290 6291 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6292 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) 6293 return -EINVAL; 6294 6295 vmx_leave_nested(vcpu); 6296 6297 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6298 return 0; 6299 6300 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6301 ret = enter_vmx_operation(vcpu); 6302 if (ret) 6303 return ret; 6304 6305 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6306 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6307 /* See vmx_has_valid_vmcs12. */ 6308 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6309 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6310 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6311 return -EINVAL; 6312 else 6313 return 0; 6314 } 6315 6316 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6317 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6318 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6319 return -EINVAL; 6320 6321 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6322 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6323 /* 6324 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6325 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6326 * restored yet. EVMCS will be mapped from 6327 * nested_get_vmcs12_pages(). 
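* Hence the EVMPTR_MAP_PENDING marker and the KVM_REQ_GET_NESTED_STATE_PAGES request below, which defer mapping the eVMCS until the vCPU is actually run.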
6328 */ 6329 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; 6330 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 6331 } else { 6332 return -EINVAL; 6333 } 6334 6335 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 6336 vmx->nested.smm.vmxon = true; 6337 vmx->nested.vmxon = false; 6338 6339 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 6340 vmx->nested.smm.guest_mode = true; 6341 } 6342 6343 vmcs12 = get_vmcs12(vcpu); 6344 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 6345 return -EFAULT; 6346 6347 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 6348 return -EINVAL; 6349 6350 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6351 return 0; 6352 6353 vmx->nested.nested_run_pending = 6354 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 6355 6356 vmx->nested.mtf_pending = 6357 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 6358 6359 ret = -EINVAL; 6360 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6361 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6362 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6363 6364 if (kvm_state->size < 6365 sizeof(*kvm_state) + 6366 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 6367 goto error_guest_mode; 6368 6369 if (copy_from_user(shadow_vmcs12, 6370 user_vmx_nested_state->shadow_vmcs12, 6371 sizeof(*shadow_vmcs12))) { 6372 ret = -EFAULT; 6373 goto error_guest_mode; 6374 } 6375 6376 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6377 !shadow_vmcs12->hdr.shadow_vmcs) 6378 goto error_guest_mode; 6379 } 6380 6381 vmx->nested.has_preemption_timer_deadline = false; 6382 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 6383 vmx->nested.has_preemption_timer_deadline = true; 6384 vmx->nested.preemption_timer_deadline = 6385 kvm_state->hdr.vmx.preemption_timer_deadline; 6386 } 6387 6388 if (nested_vmx_check_controls(vcpu, vmcs12) || 6389 nested_vmx_check_host_state(vcpu, vmcs12) || 6390 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6391 goto error_guest_mode; 6392 6393 vmx->nested.dirty_vmcs12 = true; 6394 ret = nested_vmx_enter_non_root_mode(vcpu, false); 6395 if (ret) 6396 goto error_guest_mode; 6397 6398 return 0; 6399 6400 error_guest_mode: 6401 vmx->nested.nested_run_pending = 0; 6402 return ret; 6403 } 6404 6405 void nested_vmx_set_vmcs_shadowing_bitmap(void) 6406 { 6407 if (enable_shadow_vmcs) { 6408 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 6409 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 6410 } 6411 } 6412 6413 /* 6414 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo 6415 * that madness to get the encoding for comparison. 6416 */ 6417 #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) 6418 6419 static u64 nested_vmx_calc_vmcs_enum_msr(void) 6420 { 6421 /* 6422 * Note these are the so called "index" of the VMCS field encoding, not 6423 * the index into vmcs12. 6424 */ 6425 unsigned int max_idx, idx; 6426 int i; 6427 6428 /* 6429 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in 6430 * vmcs12, regardless of whether or not the associated feature is 6431 * exposed to L1. Simply find the field with the highest index. 6432 */ 6433 max_idx = 0; 6434 for (i = 0; i < nr_vmcs12_fields; i++) { 6435 /* The vmcs12 table is very, very sparsely populated. 
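Illustrative example: the table is indexed by the field encoding rotated left by 6 as a 16-bit value, so e.g. VM_EXIT_REASON (encoding 0x4402) lands at index 0x0091, and most indices in between have no vmcs12 field at all, hence the zero offsets skipped below.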
*/ 6436 if (!vmcs_field_to_offset_table[i]) 6437 continue; 6438 6439 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); 6440 if (idx > max_idx) 6441 max_idx = idx; 6442 } 6443 6444 return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; 6445 } 6446 6447 /* 6448 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 6449 * returned for the various VMX controls MSRs when nested VMX is enabled. 6450 * The same values should also be used to verify that vmcs12 control fields are 6451 * valid during nested entry from L1 to L2. 6452 * Each of these control msrs has a low and high 32-bit half: A low bit is on 6453 * if the corresponding bit in the (32-bit) control field *must* be on, and a 6454 * bit in the high half is on if the corresponding bit in the control field 6455 * may be on. See also vmx_control_verify(). 6456 */ 6457 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) 6458 { 6459 /* 6460 * Note that as a general rule, the high half of the MSRs (bits in 6461 * the control fields which may be 1) should be initialized by the 6462 * intersection of the underlying hardware's MSR (i.e., features which 6463 * can be supported) and the list of features we want to expose - 6464 * because they are known to be properly supported in our code. 6465 * Also, usually, the low half of the MSRs (bits which must be 1) can 6466 * be set to 0, meaning that L1 may turn off any of these bits. The 6467 * reason is that if one of these bits is necessary, it will appear 6468 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 6469 * fields of vmcs01 and vmcs02, will turn these bits off - and 6470 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 6471 * These rules have exceptions below. 6472 */ 6473 6474 /* pin-based controls */ 6475 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 6476 msrs->pinbased_ctls_low, 6477 msrs->pinbased_ctls_high); 6478 msrs->pinbased_ctls_low |= 6479 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6480 msrs->pinbased_ctls_high &= 6481 PIN_BASED_EXT_INTR_MASK | 6482 PIN_BASED_NMI_EXITING | 6483 PIN_BASED_VIRTUAL_NMIS | 6484 (enable_apicv ? PIN_BASED_POSTED_INTR : 0); 6485 msrs->pinbased_ctls_high |= 6486 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6487 PIN_BASED_VMX_PREEMPTION_TIMER; 6488 6489 /* exit controls */ 6490 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 6491 msrs->exit_ctls_low, 6492 msrs->exit_ctls_high); 6493 msrs->exit_ctls_low = 6494 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 6495 6496 msrs->exit_ctls_high &= 6497 #ifdef CONFIG_X86_64 6498 VM_EXIT_HOST_ADDR_SPACE_SIZE | 6499 #endif 6500 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 6501 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 6502 msrs->exit_ctls_high |= 6503 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 6504 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 6505 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 6506 6507 /* We support free control of debug control saving. */ 6508 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 6509 6510 /* entry controls */ 6511 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 6512 msrs->entry_ctls_low, 6513 msrs->entry_ctls_high); 6514 msrs->entry_ctls_low = 6515 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 6516 msrs->entry_ctls_high &= 6517 #ifdef CONFIG_X86_64 6518 VM_ENTRY_IA32E_MODE | 6519 #endif 6520 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | 6521 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 6522 msrs->entry_ctls_high |= 6523 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 6524 6525 /* We support free control of debug control loading. 
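That is, VM_ENTRY_LOAD_DEBUG_CONTROLS is cleared from the low ("must be 1") half below while it remains set in the high ("may be 1") half via VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR, so L1 may run with the control either set or clear.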
*/ 6526 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 6527 6528 /* cpu-based controls */ 6529 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 6530 msrs->procbased_ctls_low, 6531 msrs->procbased_ctls_high); 6532 msrs->procbased_ctls_low = 6533 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6534 msrs->procbased_ctls_high &= 6535 CPU_BASED_INTR_WINDOW_EXITING | 6536 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 6537 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 6538 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 6539 CPU_BASED_CR3_STORE_EXITING | 6540 #ifdef CONFIG_X86_64 6541 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 6542 #endif 6543 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 6544 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 6545 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 6546 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 6547 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 6548 /* 6549 * We can allow some features even when not supported by the 6550 * hardware. For example, L1 can specify an MSR bitmap - and we 6551 * can use it to avoid exits to L1 - even when L0 runs L2 6552 * without MSR bitmaps. 6553 */ 6554 msrs->procbased_ctls_high |= 6555 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6556 CPU_BASED_USE_MSR_BITMAPS; 6557 6558 /* We support free control of CR3 access interception. */ 6559 msrs->procbased_ctls_low &= 6560 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 6561 6562 /* 6563 * secondary cpu-based controls. Do not include those that 6564 * depend on CPUID bits, they are added later by 6565 * vmx_vcpu_after_set_cpuid. 6566 */ 6567 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) 6568 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 6569 msrs->secondary_ctls_low, 6570 msrs->secondary_ctls_high); 6571 6572 msrs->secondary_ctls_low = 0; 6573 msrs->secondary_ctls_high &= 6574 SECONDARY_EXEC_DESC | 6575 SECONDARY_EXEC_ENABLE_RDTSCP | 6576 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 6577 SECONDARY_EXEC_WBINVD_EXITING | 6578 SECONDARY_EXEC_APIC_REGISTER_VIRT | 6579 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 6580 SECONDARY_EXEC_RDRAND_EXITING | 6581 SECONDARY_EXEC_ENABLE_INVPCID | 6582 SECONDARY_EXEC_RDSEED_EXITING | 6583 SECONDARY_EXEC_XSAVES | 6584 SECONDARY_EXEC_TSC_SCALING; 6585 6586 /* 6587 * We can emulate "VMCS shadowing," even if the hardware 6588 * doesn't support it. 
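* (Roughly: when L1 gives L2 a shadow VMCS, VMREAD/VMWRITE exits from L2 that L1 does not intercept via its vmread/vmwrite bitmaps are emulated by KVM against the cached shadow vmcs12, so no hardware shadow-VMCS support is needed for this.)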
6589 */ 6590 msrs->secondary_ctls_high |= 6591 SECONDARY_EXEC_SHADOW_VMCS; 6592 6593 if (enable_ept) { 6594 /* nested EPT: emulate EPT also to L1 */ 6595 msrs->secondary_ctls_high |= 6596 SECONDARY_EXEC_ENABLE_EPT; 6597 msrs->ept_caps = 6598 VMX_EPT_PAGE_WALK_4_BIT | 6599 VMX_EPT_PAGE_WALK_5_BIT | 6600 VMX_EPTP_WB_BIT | 6601 VMX_EPT_INVEPT_BIT | 6602 VMX_EPT_EXECUTE_ONLY_BIT; 6603 6604 msrs->ept_caps &= ept_caps; 6605 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 6606 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 6607 VMX_EPT_1GB_PAGE_BIT; 6608 if (enable_ept_ad_bits) { 6609 msrs->secondary_ctls_high |= 6610 SECONDARY_EXEC_ENABLE_PML; 6611 msrs->ept_caps |= VMX_EPT_AD_BIT; 6612 } 6613 } 6614 6615 if (cpu_has_vmx_vmfunc()) { 6616 msrs->secondary_ctls_high |= 6617 SECONDARY_EXEC_ENABLE_VMFUNC; 6618 /* 6619 * Advertise EPTP switching unconditionally 6620 * since we emulate it 6621 */ 6622 if (enable_ept) 6623 msrs->vmfunc_controls = 6624 VMX_VMFUNC_EPTP_SWITCHING; 6625 } 6626 6627 /* 6628 * Old versions of KVM use the single-context version without 6629 * checking for support, so declare that it is supported even 6630 * though it is treated as global context. The alternative is 6631 * not failing the single-context invvpid, and it is worse. 6632 */ 6633 if (enable_vpid) { 6634 msrs->secondary_ctls_high |= 6635 SECONDARY_EXEC_ENABLE_VPID; 6636 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 6637 VMX_VPID_EXTENT_SUPPORTED_MASK; 6638 } 6639 6640 if (enable_unrestricted_guest) 6641 msrs->secondary_ctls_high |= 6642 SECONDARY_EXEC_UNRESTRICTED_GUEST; 6643 6644 if (flexpriority_enabled) 6645 msrs->secondary_ctls_high |= 6646 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6647 6648 if (enable_sgx) 6649 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 6650 6651 /* miscellaneous data */ 6652 rdmsr(MSR_IA32_VMX_MISC, 6653 msrs->misc_low, 6654 msrs->misc_high); 6655 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; 6656 msrs->misc_low |= 6657 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 6658 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 6659 VMX_MISC_ACTIVITY_HLT | 6660 VMX_MISC_ACTIVITY_WAIT_SIPI; 6661 msrs->misc_high = 0; 6662 6663 /* 6664 * This MSR reports some information about VMX support. We 6665 * should return information about the VMX we emulate for the 6666 * guest, and the VMCS structure we give it - not about the 6667 * VMX support of the underlying hardware. 6668 */ 6669 msrs->basic = 6670 VMCS12_REVISION | 6671 VMX_BASIC_TRUE_CTLS | 6672 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 6673 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 6674 6675 if (cpu_has_vmx_basic_inout()) 6676 msrs->basic |= VMX_BASIC_INOUT; 6677 6678 /* 6679 * These MSRs specify bits which the guest must keep fixed on 6680 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 6681 * We picked the standard core2 setting. 6682 */ 6683 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 6684 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 6685 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 6686 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 6687 6688 /* These MSRs specify bits which the guest must keep fixed off. 
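In other words, a CR0/CR4 bit may be 1 in VMX operation only if the corresponding bit is 1 in the FIXED1 MSR read below, just as the FIXED0 values set above give the bits that must remain 1.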
*/ 6689 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 6690 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 6691 6692 msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); 6693 } 6694 6695 void nested_vmx_hardware_unsetup(void) 6696 { 6697 int i; 6698 6699 if (enable_shadow_vmcs) { 6700 for (i = 0; i < VMX_BITMAP_NR; i++) 6701 free_page((unsigned long)vmx_bitmap[i]); 6702 } 6703 } 6704 6705 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 6706 { 6707 int i; 6708 6709 if (!cpu_has_vmx_shadow_vmcs()) 6710 enable_shadow_vmcs = 0; 6711 if (enable_shadow_vmcs) { 6712 for (i = 0; i < VMX_BITMAP_NR; i++) { 6713 /* 6714 * The vmx_bitmap is not tied to a VM and so should 6715 * not be charged to a memcg. 6716 */ 6717 vmx_bitmap[i] = (unsigned long *) 6718 __get_free_page(GFP_KERNEL); 6719 if (!vmx_bitmap[i]) { 6720 nested_vmx_hardware_unsetup(); 6721 return -ENOMEM; 6722 } 6723 } 6724 6725 init_vmcs_shadow_fields(); 6726 } 6727 6728 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 6729 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 6730 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 6731 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 6732 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 6733 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 6734 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 6735 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; 6736 exit_handlers[EXIT_REASON_VMON] = handle_vmon; 6737 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 6738 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 6739 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 6740 6741 return 0; 6742 } 6743 6744 struct kvm_x86_nested_ops vmx_nested_ops = { 6745 .check_events = vmx_check_nested_events, 6746 .hv_timer_pending = nested_vmx_preemption_timer_pending, 6747 .triple_fault = nested_vmx_triple_fault, 6748 .get_state = vmx_get_nested_state, 6749 .set_state = vmx_set_nested_state, 6750 .get_nested_state_pages = vmx_get_nested_state_pages, 6751 .write_log_dirty = nested_vmx_write_pml_buffer, 6752 .enable_evmcs = nested_enable_evmcs, 6753 .get_evmcs_version = nested_get_evmcs_version, 6754 }; 6755