1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/objtool.h> 4 #include <linux/percpu.h> 5 6 #include <asm/debugreg.h> 7 #include <asm/mmu_context.h> 8 9 #include "cpuid.h" 10 #include "hyperv.h" 11 #include "mmu.h" 12 #include "nested.h" 13 #include "pmu.h" 14 #include "sgx.h" 15 #include "trace.h" 16 #include "vmx.h" 17 #include "x86.h" 18 19 static bool __read_mostly enable_shadow_vmcs = 1; 20 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 21 22 static bool __read_mostly nested_early_check = 0; 23 module_param(nested_early_check, bool, S_IRUGO); 24 25 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 26 27 /* 28 * Hyper-V requires all of these, so mark them as supported even though 29 * they are just treated the same as all-context. 30 */ 31 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 32 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 33 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 34 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 35 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 36 37 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 38 39 enum { 40 VMX_VMREAD_BITMAP, 41 VMX_VMWRITE_BITMAP, 42 VMX_BITMAP_NR 43 }; 44 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 45 46 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 47 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 48 49 struct shadow_vmcs_field { 50 u16 encoding; 51 u16 offset; 52 }; 53 static struct shadow_vmcs_field shadow_read_only_fields[] = { 54 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 55 #include "vmcs_shadow_fields.h" 56 }; 57 static int max_shadow_read_only_fields = 58 ARRAY_SIZE(shadow_read_only_fields); 59 60 static struct shadow_vmcs_field shadow_read_write_fields[] = { 61 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 62 #include "vmcs_shadow_fields.h" 63 }; 64 static int max_shadow_read_write_fields = 65 ARRAY_SIZE(shadow_read_write_fields); 66 67 static void init_vmcs_shadow_fields(void) 68 { 69 int i, j; 70 71 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 72 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 73 74 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 75 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 76 u16 field = entry.encoding; 77 78 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 79 (i + 1 == max_shadow_read_only_fields || 80 shadow_read_only_fields[i + 1].encoding != field + 1)) 81 pr_err("Missing field from shadow_read_only_field %x\n", 82 field + 1); 83 84 clear_bit(field, vmx_vmread_bitmap); 85 if (field & 1) 86 #ifdef CONFIG_X86_64 87 continue; 88 #else 89 entry.offset += sizeof(u32); 90 #endif 91 shadow_read_only_fields[j++] = entry; 92 } 93 max_shadow_read_only_fields = j; 94 95 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 96 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 97 u16 field = entry.encoding; 98 99 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 100 (i + 1 == max_shadow_read_write_fields || 101 shadow_read_write_fields[i + 1].encoding != field + 1)) 102 pr_err("Missing field from shadow_read_write_field %x\n", 103 field + 1); 104 105 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 106 field <= GUEST_TR_AR_BYTES, 107 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 108 109 /* 110 * PML and the preemption timer can be emulated, but the 111 * processor cannot vmwrite to fields that don't exist 112 * on bare metal. 113 */ 114 switch (field) { 115 case GUEST_PML_INDEX: 116 if (!cpu_has_vmx_pml()) 117 continue; 118 break; 119 case VMX_PREEMPTION_TIMER_VALUE: 120 if (!cpu_has_vmx_preemption_timer()) 121 continue; 122 break; 123 case GUEST_INTR_STATUS: 124 if (!cpu_has_vmx_apicv()) 125 continue; 126 break; 127 default: 128 break; 129 } 130 131 clear_bit(field, vmx_vmwrite_bitmap); 132 clear_bit(field, vmx_vmread_bitmap); 133 if (field & 1) 134 #ifdef CONFIG_X86_64 135 continue; 136 #else 137 entry.offset += sizeof(u32); 138 #endif 139 shadow_read_write_fields[j++] = entry; 140 } 141 max_shadow_read_write_fields = j; 142 } 143 144 /* 145 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 146 * set the success or error code of an emulated VMX instruction (as specified 147 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated 148 * instruction. 149 */ 150 static int nested_vmx_succeed(struct kvm_vcpu *vcpu) 151 { 152 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 153 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 154 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 155 return kvm_skip_emulated_instruction(vcpu); 156 } 157 158 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 159 { 160 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 161 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 162 X86_EFLAGS_SF | X86_EFLAGS_OF)) 163 | X86_EFLAGS_CF); 164 return kvm_skip_emulated_instruction(vcpu); 165 } 166 167 static int nested_vmx_failValid(struct kvm_vcpu *vcpu, 168 u32 vm_instruction_error) 169 { 170 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 171 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 172 X86_EFLAGS_SF | X86_EFLAGS_OF)) 173 | X86_EFLAGS_ZF); 174 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 175 /* 176 * We don't need to force sync to shadow VMCS because 177 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all 178 * fields and thus must be synced. 179 */ 180 if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID) 181 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; 182 183 return kvm_skip_emulated_instruction(vcpu); 184 } 185 186 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) 187 { 188 struct vcpu_vmx *vmx = to_vmx(vcpu); 189 190 /* 191 * failValid writes the error number to the current VMCS, which 192 * can't be done if there isn't a current VMCS. 193 */ 194 if (vmx->nested.current_vmptr == INVALID_GPA && 195 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 196 return nested_vmx_failInvalid(vcpu); 197 198 return nested_vmx_failValid(vcpu, vm_instruction_error); 199 } 200 201 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 202 { 203 /* TODO: not to reset guest simply here. */ 204 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 205 pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); 206 } 207 208 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 209 { 210 return fixed_bits_valid(control, low, high); 211 } 212 213 static inline u64 vmx_control_msr(u32 low, u32 high) 214 { 215 return low | ((u64)high << 32); 216 } 217 218 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 219 { 220 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 221 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 222 vmx->nested.need_vmcs12_to_shadow_sync = false; 223 } 224 225 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) 226 { 227 struct vcpu_vmx *vmx = to_vmx(vcpu); 228 229 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 230 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); 231 vmx->nested.hv_evmcs = NULL; 232 } 233 234 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 235 } 236 237 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 238 struct loaded_vmcs *prev) 239 { 240 struct vmcs_host_state *dest, *src; 241 242 if (unlikely(!vmx->guest_state_loaded)) 243 return; 244 245 src = &prev->host_state; 246 dest = &vmx->loaded_vmcs->host_state; 247 248 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 249 dest->ldt_sel = src->ldt_sel; 250 #ifdef CONFIG_X86_64 251 dest->ds_sel = src->ds_sel; 252 dest->es_sel = src->es_sel; 253 #endif 254 } 255 256 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 257 { 258 struct vcpu_vmx *vmx = to_vmx(vcpu); 259 struct loaded_vmcs *prev; 260 int cpu; 261 262 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) 263 return; 264 265 cpu = get_cpu(); 266 prev = vmx->loaded_vmcs; 267 vmx->loaded_vmcs = vmcs; 268 vmx_vcpu_load_vmcs(vcpu, cpu, prev); 269 vmx_sync_vmcs_host_state(vmx, prev); 270 put_cpu(); 271 272 vmx_register_cache_reset(vcpu); 273 } 274 275 /* 276 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 277 * just stops using VMX. 278 */ 279 static void free_nested(struct kvm_vcpu *vcpu) 280 { 281 struct vcpu_vmx *vmx = to_vmx(vcpu); 282 283 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) 284 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 285 286 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 287 return; 288 289 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 290 291 vmx->nested.vmxon = false; 292 vmx->nested.smm.vmxon = false; 293 vmx->nested.vmxon_ptr = INVALID_GPA; 294 free_vpid(vmx->nested.vpid02); 295 vmx->nested.posted_intr_nv = -1; 296 vmx->nested.current_vmptr = INVALID_GPA; 297 if (enable_shadow_vmcs) { 298 vmx_disable_shadow_vmcs(vmx); 299 vmcs_clear(vmx->vmcs01.shadow_vmcs); 300 free_vmcs(vmx->vmcs01.shadow_vmcs); 301 vmx->vmcs01.shadow_vmcs = NULL; 302 } 303 kfree(vmx->nested.cached_vmcs12); 304 vmx->nested.cached_vmcs12 = NULL; 305 kfree(vmx->nested.cached_shadow_vmcs12); 306 vmx->nested.cached_shadow_vmcs12 = NULL; 307 /* Unpin physical memory we referred to in the vmcs02 */ 308 if (vmx->nested.apic_access_page) { 309 kvm_release_page_clean(vmx->nested.apic_access_page); 310 vmx->nested.apic_access_page = NULL; 311 } 312 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 313 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 314 vmx->nested.pi_desc = NULL; 315 316 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 317 318 nested_release_evmcs(vcpu); 319 320 free_loaded_vmcs(&vmx->nested.vmcs02); 321 } 322 323 /* 324 * Ensure that the current vmcs of the logical processor is the 325 * vmcs01 of the vcpu before calling free_nested(). 326 */ 327 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 328 { 329 vcpu_load(vcpu); 330 vmx_leave_nested(vcpu); 331 vcpu_put(vcpu); 332 } 333 334 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 335 336 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 337 { 338 return VALID_PAGE(root_hpa) && 339 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 340 } 341 342 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 343 gpa_t addr) 344 { 345 uint i; 346 struct kvm_mmu_root_info *cached_root; 347 348 WARN_ON_ONCE(!mmu_is_nested(vcpu)); 349 350 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 351 cached_root = &vcpu->arch.mmu->prev_roots[i]; 352 353 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 354 eptp)) 355 vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); 356 } 357 } 358 359 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 360 struct x86_exception *fault) 361 { 362 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 363 struct vcpu_vmx *vmx = to_vmx(vcpu); 364 u32 vm_exit_reason; 365 unsigned long exit_qualification = vcpu->arch.exit_qualification; 366 367 if (vmx->nested.pml_full) { 368 vm_exit_reason = EXIT_REASON_PML_FULL; 369 vmx->nested.pml_full = false; 370 exit_qualification &= INTR_INFO_UNBLOCK_NMI; 371 } else { 372 if (fault->error_code & PFERR_RSVD_MASK) 373 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 374 else 375 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 376 377 /* 378 * Although the caller (kvm_inject_emulated_page_fault) would 379 * have already synced the faulting address in the shadow EPT 380 * tables for the current EPTP12, we also need to sync it for 381 * any other cached EPTP02s based on the same EP4TA, since the 382 * TLB associates mappings to the EP4TA rather than the full EPTP. 383 */ 384 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, 385 fault->address); 386 } 387 388 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 389 vmcs12->guest_physical_address = fault->address; 390 } 391 392 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) 393 { 394 kvm_init_shadow_ept_mmu(vcpu, 395 to_vmx(vcpu)->nested.msrs.ept_caps & 396 VMX_EPT_EXECUTE_ONLY_BIT, 397 nested_ept_ad_enabled(vcpu), 398 nested_ept_get_eptp(vcpu)); 399 } 400 401 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 402 { 403 WARN_ON(mmu_is_nested(vcpu)); 404 405 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 406 nested_ept_new_eptp(vcpu); 407 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; 408 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 409 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 410 411 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 412 } 413 414 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 415 { 416 vcpu->arch.mmu = &vcpu->arch.root_mmu; 417 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 418 } 419 420 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 421 u16 error_code) 422 { 423 bool inequality, bit; 424 425 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 426 inequality = 427 (error_code & vmcs12->page_fault_error_code_mask) != 428 vmcs12->page_fault_error_code_match; 429 return inequality ^ bit; 430 } 431 432 433 /* 434 * KVM wants to inject page-faults which it got to the guest. This function 435 * checks whether in a nested guest, we need to inject them to L1 or L2. 436 */ 437 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) 438 { 439 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 440 unsigned int nr = vcpu->arch.exception.nr; 441 bool has_payload = vcpu->arch.exception.has_payload; 442 unsigned long payload = vcpu->arch.exception.payload; 443 444 if (nr == PF_VECTOR) { 445 if (vcpu->arch.exception.nested_apf) { 446 *exit_qual = vcpu->arch.apf.nested_apf_token; 447 return 1; 448 } 449 if (nested_vmx_is_page_fault_vmexit(vmcs12, 450 vcpu->arch.exception.error_code)) { 451 *exit_qual = has_payload ? payload : vcpu->arch.cr2; 452 return 1; 453 } 454 } else if (vmcs12->exception_bitmap & (1u << nr)) { 455 if (nr == DB_VECTOR) { 456 if (!has_payload) { 457 payload = vcpu->arch.dr6; 458 payload &= ~DR6_BT; 459 payload ^= DR6_ACTIVE_LOW; 460 } 461 *exit_qual = payload; 462 } else 463 *exit_qual = 0; 464 return 1; 465 } 466 467 return 0; 468 } 469 470 471 static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, 472 struct x86_exception *fault) 473 { 474 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 475 476 WARN_ON(!is_guest_mode(vcpu)); 477 478 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && 479 !to_vmx(vcpu)->nested.nested_run_pending) { 480 vmcs12->vm_exit_intr_error_code = fault->error_code; 481 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 482 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | 483 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, 484 fault->address); 485 } else { 486 kvm_inject_page_fault(vcpu, fault); 487 } 488 } 489 490 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 491 struct vmcs12 *vmcs12) 492 { 493 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 494 return 0; 495 496 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 497 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 498 return -EINVAL; 499 500 return 0; 501 } 502 503 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 504 struct vmcs12 *vmcs12) 505 { 506 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 507 return 0; 508 509 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 510 return -EINVAL; 511 512 return 0; 513 } 514 515 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 516 struct vmcs12 *vmcs12) 517 { 518 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 519 return 0; 520 521 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 522 return -EINVAL; 523 524 return 0; 525 } 526 527 /* 528 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 529 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, 530 * only the "disable intercept" case needs to be handled. 531 */ 532 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, 533 unsigned long *msr_bitmap_l0, 534 u32 msr, int type) 535 { 536 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) 537 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); 538 539 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) 540 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); 541 } 542 543 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 544 { 545 int msr; 546 547 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 548 unsigned word = msr / BITS_PER_LONG; 549 550 msr_bitmap[word] = ~0; 551 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 552 } 553 } 554 555 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ 556 static inline \ 557 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ 558 unsigned long *msr_bitmap_l1, \ 559 unsigned long *msr_bitmap_l0, u32 msr) \ 560 { \ 561 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ 562 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ 563 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 564 else \ 565 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 566 } 567 BUILD_NVMX_MSR_INTERCEPT_HELPER(read) 568 BUILD_NVMX_MSR_INTERCEPT_HELPER(write) 569 570 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, 571 unsigned long *msr_bitmap_l1, 572 unsigned long *msr_bitmap_l0, 573 u32 msr, int types) 574 { 575 if (types & MSR_TYPE_R) 576 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, 577 msr_bitmap_l0, msr); 578 if (types & MSR_TYPE_W) 579 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, 580 msr_bitmap_l0, msr); 581 } 582 583 /* 584 * Merge L0's and L1's MSR bitmap, return false to indicate that 585 * we do not use the hardware. 586 */ 587 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 588 struct vmcs12 *vmcs12) 589 { 590 struct vcpu_vmx *vmx = to_vmx(vcpu); 591 int msr; 592 unsigned long *msr_bitmap_l1; 593 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; 594 struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; 595 596 /* Nothing to do if the MSR bitmap is not in use. */ 597 if (!cpu_has_vmx_msr_bitmap() || 598 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 599 return false; 600 601 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) 602 return false; 603 604 msr_bitmap_l1 = (unsigned long *)map->hva; 605 606 /* 607 * To keep the control flow simple, pay eight 8-byte writes (sixteen 608 * 4-byte writes on 32-bit systems) up front to enable intercepts for 609 * the x2APIC MSR range and selectively toggle those relevant to L2. 610 */ 611 enable_x2apic_msr_intercepts(msr_bitmap_l0); 612 613 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 614 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 615 /* 616 * L0 need not intercept reads for MSRs between 0x800 617 * and 0x8ff, it just lets the processor take the value 618 * from the virtual-APIC page; take those 256 bits 619 * directly from the L1 bitmap. 620 */ 621 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 622 unsigned word = msr / BITS_PER_LONG; 623 624 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 625 } 626 } 627 628 nested_vmx_disable_intercept_for_x2apic_msr( 629 msr_bitmap_l1, msr_bitmap_l0, 630 X2APIC_MSR(APIC_TASKPRI), 631 MSR_TYPE_R | MSR_TYPE_W); 632 633 if (nested_cpu_has_vid(vmcs12)) { 634 nested_vmx_disable_intercept_for_x2apic_msr( 635 msr_bitmap_l1, msr_bitmap_l0, 636 X2APIC_MSR(APIC_EOI), 637 MSR_TYPE_W); 638 nested_vmx_disable_intercept_for_x2apic_msr( 639 msr_bitmap_l1, msr_bitmap_l0, 640 X2APIC_MSR(APIC_SELF_IPI), 641 MSR_TYPE_W); 642 } 643 } 644 645 /* 646 * Always check vmcs01's bitmap to honor userspace MSR filters and any 647 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 648 */ 649 #ifdef CONFIG_X86_64 650 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 651 MSR_FS_BASE, MSR_TYPE_RW); 652 653 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 654 MSR_GS_BASE, MSR_TYPE_RW); 655 656 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 657 MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 658 #endif 659 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 660 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 661 662 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 663 MSR_IA32_PRED_CMD, MSR_TYPE_W); 664 665 kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); 666 667 return true; 668 } 669 670 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 671 struct vmcs12 *vmcs12) 672 { 673 struct kvm_host_map map; 674 struct vmcs12 *shadow; 675 676 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 677 vmcs12->vmcs_link_pointer == INVALID_GPA) 678 return; 679 680 shadow = get_shadow_vmcs12(vcpu); 681 682 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) 683 return; 684 685 memcpy(shadow, map.hva, VMCS12_SIZE); 686 kvm_vcpu_unmap(vcpu, &map, false); 687 } 688 689 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 690 struct vmcs12 *vmcs12) 691 { 692 struct vcpu_vmx *vmx = to_vmx(vcpu); 693 694 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 695 vmcs12->vmcs_link_pointer == INVALID_GPA) 696 return; 697 698 kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, 699 get_shadow_vmcs12(vcpu), VMCS12_SIZE); 700 } 701 702 /* 703 * In nested virtualization, check if L1 has set 704 * VM_EXIT_ACK_INTR_ON_EXIT 705 */ 706 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 707 { 708 return get_vmcs12(vcpu)->vm_exit_controls & 709 VM_EXIT_ACK_INTR_ON_EXIT; 710 } 711 712 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 713 struct vmcs12 *vmcs12) 714 { 715 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 716 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 717 return -EINVAL; 718 else 719 return 0; 720 } 721 722 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 723 struct vmcs12 *vmcs12) 724 { 725 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 726 !nested_cpu_has_apic_reg_virt(vmcs12) && 727 !nested_cpu_has_vid(vmcs12) && 728 !nested_cpu_has_posted_intr(vmcs12)) 729 return 0; 730 731 /* 732 * If virtualize x2apic mode is enabled, 733 * virtualize apic access must be disabled. 734 */ 735 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 736 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 737 return -EINVAL; 738 739 /* 740 * If virtual interrupt delivery is enabled, 741 * we must exit on external interrupts. 742 */ 743 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 744 return -EINVAL; 745 746 /* 747 * bits 15:8 should be zero in posted_intr_nv, 748 * the descriptor address has been already checked 749 * in nested_get_vmcs12_pages. 750 * 751 * bits 5:0 of posted_intr_desc_addr should be zero. 752 */ 753 if (nested_cpu_has_posted_intr(vmcs12) && 754 (CC(!nested_cpu_has_vid(vmcs12)) || 755 CC(!nested_exit_intr_ack_set(vcpu)) || 756 CC((vmcs12->posted_intr_nv & 0xff00)) || 757 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) 758 return -EINVAL; 759 760 /* tpr shadow is needed by all apicv features. */ 761 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 762 return -EINVAL; 763 764 return 0; 765 } 766 767 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 768 u32 count, u64 addr) 769 { 770 if (count == 0) 771 return 0; 772 773 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || 774 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) 775 return -EINVAL; 776 777 return 0; 778 } 779 780 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 781 struct vmcs12 *vmcs12) 782 { 783 if (CC(nested_vmx_check_msr_switch(vcpu, 784 vmcs12->vm_exit_msr_load_count, 785 vmcs12->vm_exit_msr_load_addr)) || 786 CC(nested_vmx_check_msr_switch(vcpu, 787 vmcs12->vm_exit_msr_store_count, 788 vmcs12->vm_exit_msr_store_addr))) 789 return -EINVAL; 790 791 return 0; 792 } 793 794 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 795 struct vmcs12 *vmcs12) 796 { 797 if (CC(nested_vmx_check_msr_switch(vcpu, 798 vmcs12->vm_entry_msr_load_count, 799 vmcs12->vm_entry_msr_load_addr))) 800 return -EINVAL; 801 802 return 0; 803 } 804 805 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 806 struct vmcs12 *vmcs12) 807 { 808 if (!nested_cpu_has_pml(vmcs12)) 809 return 0; 810 811 if (CC(!nested_cpu_has_ept(vmcs12)) || 812 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 813 return -EINVAL; 814 815 return 0; 816 } 817 818 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 819 struct vmcs12 *vmcs12) 820 { 821 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 822 !nested_cpu_has_ept(vmcs12))) 823 return -EINVAL; 824 return 0; 825 } 826 827 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 828 struct vmcs12 *vmcs12) 829 { 830 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 831 !nested_cpu_has_ept(vmcs12))) 832 return -EINVAL; 833 return 0; 834 } 835 836 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 837 struct vmcs12 *vmcs12) 838 { 839 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 840 return 0; 841 842 if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 843 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 844 return -EINVAL; 845 846 return 0; 847 } 848 849 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 850 struct vmx_msr_entry *e) 851 { 852 /* x2APIC MSR accesses are not allowed */ 853 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 854 return -EINVAL; 855 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 856 CC(e->index == MSR_IA32_UCODE_REV)) 857 return -EINVAL; 858 if (CC(e->reserved != 0)) 859 return -EINVAL; 860 return 0; 861 } 862 863 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 864 struct vmx_msr_entry *e) 865 { 866 if (CC(e->index == MSR_FS_BASE) || 867 CC(e->index == MSR_GS_BASE) || 868 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 869 nested_vmx_msr_check_common(vcpu, e)) 870 return -EINVAL; 871 return 0; 872 } 873 874 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 875 struct vmx_msr_entry *e) 876 { 877 if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ 878 nested_vmx_msr_check_common(vcpu, e)) 879 return -EINVAL; 880 return 0; 881 } 882 883 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) 884 { 885 struct vcpu_vmx *vmx = to_vmx(vcpu); 886 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 887 vmx->nested.msrs.misc_high); 888 889 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; 890 } 891 892 /* 893 * Load guest's/host's msr at nested entry/exit. 894 * return 0 for success, entry index for failure. 895 * 896 * One of the failure modes for MSR load/store is when a list exceeds the 897 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch 898 * as possible, process all valid entries before failing rather than precheck 899 * for a capacity violation. 900 */ 901 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 902 { 903 u32 i; 904 struct vmx_msr_entry e; 905 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 906 907 for (i = 0; i < count; i++) { 908 if (unlikely(i >= max_msr_list_size)) 909 goto fail; 910 911 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 912 &e, sizeof(e))) { 913 pr_debug_ratelimited( 914 "%s cannot read MSR entry (%u, 0x%08llx)\n", 915 __func__, i, gpa + i * sizeof(e)); 916 goto fail; 917 } 918 if (nested_vmx_load_msr_check(vcpu, &e)) { 919 pr_debug_ratelimited( 920 "%s check failed (%u, 0x%x, 0x%x)\n", 921 __func__, i, e.index, e.reserved); 922 goto fail; 923 } 924 if (kvm_set_msr(vcpu, e.index, e.value)) { 925 pr_debug_ratelimited( 926 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 927 __func__, i, e.index, e.value); 928 goto fail; 929 } 930 } 931 return 0; 932 fail: 933 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ 934 return i + 1; 935 } 936 937 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, 938 u32 msr_index, 939 u64 *data) 940 { 941 struct vcpu_vmx *vmx = to_vmx(vcpu); 942 943 /* 944 * If the L0 hypervisor stored a more accurate value for the TSC that 945 * does not include the time taken for emulation of the L2->L1 946 * VM-exit in L0, use the more accurate value. 947 */ 948 if (msr_index == MSR_IA32_TSC) { 949 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, 950 MSR_IA32_TSC); 951 952 if (i >= 0) { 953 u64 val = vmx->msr_autostore.guest.val[i].value; 954 955 *data = kvm_read_l1_tsc(vcpu, val); 956 return true; 957 } 958 } 959 960 if (kvm_get_msr(vcpu, msr_index, data)) { 961 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 962 msr_index); 963 return false; 964 } 965 return true; 966 } 967 968 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 969 struct vmx_msr_entry *e) 970 { 971 if (kvm_vcpu_read_guest(vcpu, 972 gpa + i * sizeof(*e), 973 e, 2 * sizeof(u32))) { 974 pr_debug_ratelimited( 975 "%s cannot read MSR entry (%u, 0x%08llx)\n", 976 __func__, i, gpa + i * sizeof(*e)); 977 return false; 978 } 979 if (nested_vmx_store_msr_check(vcpu, e)) { 980 pr_debug_ratelimited( 981 "%s check failed (%u, 0x%x, 0x%x)\n", 982 __func__, i, e->index, e->reserved); 983 return false; 984 } 985 return true; 986 } 987 988 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 989 { 990 u64 data; 991 u32 i; 992 struct vmx_msr_entry e; 993 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 994 995 for (i = 0; i < count; i++) { 996 if (unlikely(i >= max_msr_list_size)) 997 return -EINVAL; 998 999 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1000 return -EINVAL; 1001 1002 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 1003 return -EINVAL; 1004 1005 if (kvm_vcpu_write_guest(vcpu, 1006 gpa + i * sizeof(e) + 1007 offsetof(struct vmx_msr_entry, value), 1008 &data, sizeof(data))) { 1009 pr_debug_ratelimited( 1010 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1011 __func__, i, e.index, data); 1012 return -EINVAL; 1013 } 1014 } 1015 return 0; 1016 } 1017 1018 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1019 { 1020 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1021 u32 count = vmcs12->vm_exit_msr_store_count; 1022 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1023 struct vmx_msr_entry e; 1024 u32 i; 1025 1026 for (i = 0; i < count; i++) { 1027 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1028 return false; 1029 1030 if (e.index == msr_index) 1031 return true; 1032 } 1033 return false; 1034 } 1035 1036 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, 1037 u32 msr_index) 1038 { 1039 struct vcpu_vmx *vmx = to_vmx(vcpu); 1040 struct vmx_msrs *autostore = &vmx->msr_autostore.guest; 1041 bool in_vmcs12_store_list; 1042 int msr_autostore_slot; 1043 bool in_autostore_list; 1044 int last; 1045 1046 msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); 1047 in_autostore_list = msr_autostore_slot >= 0; 1048 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); 1049 1050 if (in_vmcs12_store_list && !in_autostore_list) { 1051 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { 1052 /* 1053 * Emulated VMEntry does not fail here. Instead a less 1054 * accurate value will be returned by 1055 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() 1056 * instead of reading the value from the vmcs02 VMExit 1057 * MSR-store area. 1058 */ 1059 pr_warn_ratelimited( 1060 "Not enough msr entries in msr_autostore. Can't add msr %x\n", 1061 msr_index); 1062 return; 1063 } 1064 last = autostore->nr++; 1065 autostore->val[last].index = msr_index; 1066 } else if (!in_vmcs12_store_list && in_autostore_list) { 1067 last = --autostore->nr; 1068 autostore->val[msr_autostore_slot] = autostore->val[last]; 1069 } 1070 } 1071 1072 /* 1073 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are 1074 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected 1075 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to 1076 * @entry_failure_code. 1077 */ 1078 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, 1079 bool nested_ept, bool reload_pdptrs, 1080 enum vm_entry_failure_code *entry_failure_code) 1081 { 1082 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) { 1083 *entry_failure_code = ENTRY_FAIL_DEFAULT; 1084 return -EINVAL; 1085 } 1086 1087 /* 1088 * If PAE paging and EPT are both on, CR3 is not used by the CPU and 1089 * must not be dereferenced. 1090 */ 1091 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && 1092 CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) { 1093 *entry_failure_code = ENTRY_FAIL_PDPTE; 1094 return -EINVAL; 1095 } 1096 1097 if (!nested_ept) 1098 kvm_mmu_new_pgd(vcpu, cr3); 1099 1100 vcpu->arch.cr3 = cr3; 1101 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 1102 1103 /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ 1104 kvm_init_mmu(vcpu); 1105 1106 return 0; 1107 } 1108 1109 /* 1110 * Returns if KVM is able to config CPU to tag TLB entries 1111 * populated by L2 differently than TLB entries populated 1112 * by L1. 1113 * 1114 * If L0 uses EPT, L1 and L2 run with different EPTP because 1115 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries 1116 * are tagged with different EPTP. 1117 * 1118 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged 1119 * with different VPID (L1 entries are tagged with vmx->vpid 1120 * while L2 entries are tagged with vmx->nested.vpid02). 1121 */ 1122 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) 1123 { 1124 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1125 1126 return enable_ept || 1127 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); 1128 } 1129 1130 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, 1131 struct vmcs12 *vmcs12, 1132 bool is_vmenter) 1133 { 1134 struct vcpu_vmx *vmx = to_vmx(vcpu); 1135 1136 /* 1137 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings 1138 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a 1139 * full TLB flush from the guest's perspective. This is required even 1140 * if VPID is disabled in the host as KVM may need to synchronize the 1141 * MMU in response to the guest TLB flush. 1142 * 1143 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. 1144 * EPT is a special snowflake, as guest-physical mappings aren't 1145 * flushed on VPID invalidations, including VM-Enter or VM-Exit with 1146 * VPID disabled. As a result, KVM _never_ needs to sync nEPT 1147 * entries on VM-Enter because L1 can't rely on VM-Enter to flush 1148 * those mappings. 1149 */ 1150 if (!nested_cpu_has_vpid(vmcs12)) { 1151 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1152 return; 1153 } 1154 1155 /* L2 should never have a VPID if VPID is disabled. */ 1156 WARN_ON(!enable_vpid); 1157 1158 /* 1159 * If VPID is enabled and used by vmc12, but L2 does not have a unique 1160 * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate 1161 * a VPID for L2, flush the current context as the effective ASID is 1162 * common to both L1 and L2. 1163 * 1164 * Defer the flush so that it runs after vmcs02.EPTP has been set by 1165 * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid 1166 * redundant flushes further down the nested pipeline. 1167 * 1168 * If a TLB flush isn't required due to any of the above, and vpid12 is 1169 * changing then the new "virtual" VPID (vpid12) will reuse the same 1170 * "real" VPID (vpid02), and so needs to be flushed. There's no direct 1171 * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for 1172 * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate 1173 * guest-physical mappings, so there is no need to sync the nEPT MMU. 1174 */ 1175 if (!nested_has_guest_tlb_tag(vcpu)) { 1176 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 1177 } else if (is_vmenter && 1178 vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 1179 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 1180 vpid_sync_context(nested_get_vpid02(vcpu)); 1181 } 1182 } 1183 1184 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) 1185 { 1186 superset &= mask; 1187 subset &= mask; 1188 1189 return (superset | subset) == superset; 1190 } 1191 1192 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1193 { 1194 const u64 feature_and_reserved = 1195 /* feature (except bit 48; see below) */ 1196 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | 1197 /* reserved */ 1198 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); 1199 u64 vmx_basic = vmx->nested.msrs.basic; 1200 1201 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) 1202 return -EINVAL; 1203 1204 /* 1205 * KVM does not emulate a version of VMX that constrains physical 1206 * addresses of VMX structures (e.g. VMCS) to 32-bits. 1207 */ 1208 if (data & BIT_ULL(48)) 1209 return -EINVAL; 1210 1211 if (vmx_basic_vmcs_revision_id(vmx_basic) != 1212 vmx_basic_vmcs_revision_id(data)) 1213 return -EINVAL; 1214 1215 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 1216 return -EINVAL; 1217 1218 vmx->nested.msrs.basic = data; 1219 return 0; 1220 } 1221 1222 static int 1223 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1224 { 1225 u64 supported; 1226 u32 *lowp, *highp; 1227 1228 switch (msr_index) { 1229 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1230 lowp = &vmx->nested.msrs.pinbased_ctls_low; 1231 highp = &vmx->nested.msrs.pinbased_ctls_high; 1232 break; 1233 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1234 lowp = &vmx->nested.msrs.procbased_ctls_low; 1235 highp = &vmx->nested.msrs.procbased_ctls_high; 1236 break; 1237 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1238 lowp = &vmx->nested.msrs.exit_ctls_low; 1239 highp = &vmx->nested.msrs.exit_ctls_high; 1240 break; 1241 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1242 lowp = &vmx->nested.msrs.entry_ctls_low; 1243 highp = &vmx->nested.msrs.entry_ctls_high; 1244 break; 1245 case MSR_IA32_VMX_PROCBASED_CTLS2: 1246 lowp = &vmx->nested.msrs.secondary_ctls_low; 1247 highp = &vmx->nested.msrs.secondary_ctls_high; 1248 break; 1249 default: 1250 BUG(); 1251 } 1252 1253 supported = vmx_control_msr(*lowp, *highp); 1254 1255 /* Check must-be-1 bits are still 1. */ 1256 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) 1257 return -EINVAL; 1258 1259 /* Check must-be-0 bits are still 0. */ 1260 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) 1261 return -EINVAL; 1262 1263 *lowp = data; 1264 *highp = data >> 32; 1265 return 0; 1266 } 1267 1268 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1269 { 1270 const u64 feature_and_reserved_bits = 1271 /* feature */ 1272 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | 1273 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | 1274 /* reserved */ 1275 GENMASK_ULL(13, 9) | BIT_ULL(31); 1276 u64 vmx_misc; 1277 1278 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 1279 vmx->nested.msrs.misc_high); 1280 1281 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) 1282 return -EINVAL; 1283 1284 if ((vmx->nested.msrs.pinbased_ctls_high & 1285 PIN_BASED_VMX_PREEMPTION_TIMER) && 1286 vmx_misc_preemption_timer_rate(data) != 1287 vmx_misc_preemption_timer_rate(vmx_misc)) 1288 return -EINVAL; 1289 1290 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1291 return -EINVAL; 1292 1293 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1294 return -EINVAL; 1295 1296 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1297 return -EINVAL; 1298 1299 vmx->nested.msrs.misc_low = data; 1300 vmx->nested.msrs.misc_high = data >> 32; 1301 1302 return 0; 1303 } 1304 1305 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1306 { 1307 u64 vmx_ept_vpid_cap; 1308 1309 vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, 1310 vmx->nested.msrs.vpid_caps); 1311 1312 /* Every bit is either reserved or a feature bit. */ 1313 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 1314 return -EINVAL; 1315 1316 vmx->nested.msrs.ept_caps = data; 1317 vmx->nested.msrs.vpid_caps = data >> 32; 1318 return 0; 1319 } 1320 1321 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1322 { 1323 u64 *msr; 1324 1325 switch (msr_index) { 1326 case MSR_IA32_VMX_CR0_FIXED0: 1327 msr = &vmx->nested.msrs.cr0_fixed0; 1328 break; 1329 case MSR_IA32_VMX_CR4_FIXED0: 1330 msr = &vmx->nested.msrs.cr4_fixed0; 1331 break; 1332 default: 1333 BUG(); 1334 } 1335 1336 /* 1337 * 1 bits (which indicates bits which "must-be-1" during VMX operation) 1338 * must be 1 in the restored value. 1339 */ 1340 if (!is_bitwise_subset(data, *msr, -1ULL)) 1341 return -EINVAL; 1342 1343 *msr = data; 1344 return 0; 1345 } 1346 1347 /* 1348 * Called when userspace is restoring VMX MSRs. 1349 * 1350 * Returns 0 on success, non-0 otherwise. 1351 */ 1352 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1353 { 1354 struct vcpu_vmx *vmx = to_vmx(vcpu); 1355 1356 /* 1357 * Don't allow changes to the VMX capability MSRs while the vCPU 1358 * is in VMX operation. 1359 */ 1360 if (vmx->nested.vmxon) 1361 return -EBUSY; 1362 1363 switch (msr_index) { 1364 case MSR_IA32_VMX_BASIC: 1365 return vmx_restore_vmx_basic(vmx, data); 1366 case MSR_IA32_VMX_PINBASED_CTLS: 1367 case MSR_IA32_VMX_PROCBASED_CTLS: 1368 case MSR_IA32_VMX_EXIT_CTLS: 1369 case MSR_IA32_VMX_ENTRY_CTLS: 1370 /* 1371 * The "non-true" VMX capability MSRs are generated from the 1372 * "true" MSRs, so we do not support restoring them directly. 1373 * 1374 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1375 * should restore the "true" MSRs with the must-be-1 bits 1376 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND 1377 * DEFAULT SETTINGS". 1378 */ 1379 return -EINVAL; 1380 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1381 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1382 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1383 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1384 case MSR_IA32_VMX_PROCBASED_CTLS2: 1385 return vmx_restore_control_msr(vmx, msr_index, data); 1386 case MSR_IA32_VMX_MISC: 1387 return vmx_restore_vmx_misc(vmx, data); 1388 case MSR_IA32_VMX_CR0_FIXED0: 1389 case MSR_IA32_VMX_CR4_FIXED0: 1390 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1391 case MSR_IA32_VMX_CR0_FIXED1: 1392 case MSR_IA32_VMX_CR4_FIXED1: 1393 /* 1394 * These MSRs are generated based on the vCPU's CPUID, so we 1395 * do not support restoring them directly. 1396 */ 1397 return -EINVAL; 1398 case MSR_IA32_VMX_EPT_VPID_CAP: 1399 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 1400 case MSR_IA32_VMX_VMCS_ENUM: 1401 vmx->nested.msrs.vmcs_enum = data; 1402 return 0; 1403 case MSR_IA32_VMX_VMFUNC: 1404 if (data & ~vmx->nested.msrs.vmfunc_controls) 1405 return -EINVAL; 1406 vmx->nested.msrs.vmfunc_controls = data; 1407 return 0; 1408 default: 1409 /* 1410 * The rest of the VMX capability MSRs do not support restore. 1411 */ 1412 return -EINVAL; 1413 } 1414 } 1415 1416 /* Returns 0 on success, non-0 otherwise. */ 1417 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1418 { 1419 switch (msr_index) { 1420 case MSR_IA32_VMX_BASIC: 1421 *pdata = msrs->basic; 1422 break; 1423 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1424 case MSR_IA32_VMX_PINBASED_CTLS: 1425 *pdata = vmx_control_msr( 1426 msrs->pinbased_ctls_low, 1427 msrs->pinbased_ctls_high); 1428 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1429 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1430 break; 1431 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1432 case MSR_IA32_VMX_PROCBASED_CTLS: 1433 *pdata = vmx_control_msr( 1434 msrs->procbased_ctls_low, 1435 msrs->procbased_ctls_high); 1436 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1437 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1438 break; 1439 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1440 case MSR_IA32_VMX_EXIT_CTLS: 1441 *pdata = vmx_control_msr( 1442 msrs->exit_ctls_low, 1443 msrs->exit_ctls_high); 1444 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1445 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1446 break; 1447 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1448 case MSR_IA32_VMX_ENTRY_CTLS: 1449 *pdata = vmx_control_msr( 1450 msrs->entry_ctls_low, 1451 msrs->entry_ctls_high); 1452 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1453 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1454 break; 1455 case MSR_IA32_VMX_MISC: 1456 *pdata = vmx_control_msr( 1457 msrs->misc_low, 1458 msrs->misc_high); 1459 break; 1460 case MSR_IA32_VMX_CR0_FIXED0: 1461 *pdata = msrs->cr0_fixed0; 1462 break; 1463 case MSR_IA32_VMX_CR0_FIXED1: 1464 *pdata = msrs->cr0_fixed1; 1465 break; 1466 case MSR_IA32_VMX_CR4_FIXED0: 1467 *pdata = msrs->cr4_fixed0; 1468 break; 1469 case MSR_IA32_VMX_CR4_FIXED1: 1470 *pdata = msrs->cr4_fixed1; 1471 break; 1472 case MSR_IA32_VMX_VMCS_ENUM: 1473 *pdata = msrs->vmcs_enum; 1474 break; 1475 case MSR_IA32_VMX_PROCBASED_CTLS2: 1476 *pdata = vmx_control_msr( 1477 msrs->secondary_ctls_low, 1478 msrs->secondary_ctls_high); 1479 break; 1480 case MSR_IA32_VMX_EPT_VPID_CAP: 1481 *pdata = msrs->ept_caps | 1482 ((u64)msrs->vpid_caps << 32); 1483 break; 1484 case MSR_IA32_VMX_VMFUNC: 1485 *pdata = msrs->vmfunc_controls; 1486 break; 1487 default: 1488 return 1; 1489 } 1490 1491 return 0; 1492 } 1493 1494 /* 1495 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have 1496 * been modified by the L1 guest. Note, "writable" in this context means 1497 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1498 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1499 * VM-exit information fields (which are actually writable if the vCPU is 1500 * configured to support "VMWRITE to any supported field in the VMCS"). 1501 */ 1502 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1503 { 1504 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1505 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1506 struct shadow_vmcs_field field; 1507 unsigned long val; 1508 int i; 1509 1510 if (WARN_ON(!shadow_vmcs)) 1511 return; 1512 1513 preempt_disable(); 1514 1515 vmcs_load(shadow_vmcs); 1516 1517 for (i = 0; i < max_shadow_read_write_fields; i++) { 1518 field = shadow_read_write_fields[i]; 1519 val = __vmcs_readl(field.encoding); 1520 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1521 } 1522 1523 vmcs_clear(shadow_vmcs); 1524 vmcs_load(vmx->loaded_vmcs->vmcs); 1525 1526 preempt_enable(); 1527 } 1528 1529 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1530 { 1531 const struct shadow_vmcs_field *fields[] = { 1532 shadow_read_write_fields, 1533 shadow_read_only_fields 1534 }; 1535 const int max_fields[] = { 1536 max_shadow_read_write_fields, 1537 max_shadow_read_only_fields 1538 }; 1539 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1540 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1541 struct shadow_vmcs_field field; 1542 unsigned long val; 1543 int i, q; 1544 1545 if (WARN_ON(!shadow_vmcs)) 1546 return; 1547 1548 vmcs_load(shadow_vmcs); 1549 1550 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1551 for (i = 0; i < max_fields[q]; i++) { 1552 field = fields[q][i]; 1553 val = vmcs12_read_any(vmcs12, field.encoding, 1554 field.offset); 1555 __vmcs_writel(field.encoding, val); 1556 } 1557 } 1558 1559 vmcs_clear(shadow_vmcs); 1560 vmcs_load(vmx->loaded_vmcs->vmcs); 1561 } 1562 1563 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) 1564 { 1565 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1566 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1567 1568 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1569 vmcs12->tpr_threshold = evmcs->tpr_threshold; 1570 vmcs12->guest_rip = evmcs->guest_rip; 1571 1572 if (unlikely(!(hv_clean_fields & 1573 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1574 vmcs12->guest_rsp = evmcs->guest_rsp; 1575 vmcs12->guest_rflags = evmcs->guest_rflags; 1576 vmcs12->guest_interruptibility_info = 1577 evmcs->guest_interruptibility_info; 1578 } 1579 1580 if (unlikely(!(hv_clean_fields & 1581 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1582 vmcs12->cpu_based_vm_exec_control = 1583 evmcs->cpu_based_vm_exec_control; 1584 } 1585 1586 if (unlikely(!(hv_clean_fields & 1587 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1588 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1589 } 1590 1591 if (unlikely(!(hv_clean_fields & 1592 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1593 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1594 } 1595 1596 if (unlikely(!(hv_clean_fields & 1597 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1598 vmcs12->vm_entry_intr_info_field = 1599 evmcs->vm_entry_intr_info_field; 1600 vmcs12->vm_entry_exception_error_code = 1601 evmcs->vm_entry_exception_error_code; 1602 vmcs12->vm_entry_instruction_len = 1603 evmcs->vm_entry_instruction_len; 1604 } 1605 1606 if (unlikely(!(hv_clean_fields & 1607 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { 1608 vmcs12->host_ia32_pat = evmcs->host_ia32_pat; 1609 vmcs12->host_ia32_efer = evmcs->host_ia32_efer; 1610 vmcs12->host_cr0 = evmcs->host_cr0; 1611 vmcs12->host_cr3 = evmcs->host_cr3; 1612 vmcs12->host_cr4 = evmcs->host_cr4; 1613 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; 1614 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; 1615 vmcs12->host_rip = evmcs->host_rip; 1616 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; 1617 vmcs12->host_es_selector = evmcs->host_es_selector; 1618 vmcs12->host_cs_selector = evmcs->host_cs_selector; 1619 vmcs12->host_ss_selector = evmcs->host_ss_selector; 1620 vmcs12->host_ds_selector = evmcs->host_ds_selector; 1621 vmcs12->host_fs_selector = evmcs->host_fs_selector; 1622 vmcs12->host_gs_selector = evmcs->host_gs_selector; 1623 vmcs12->host_tr_selector = evmcs->host_tr_selector; 1624 } 1625 1626 if (unlikely(!(hv_clean_fields & 1627 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { 1628 vmcs12->pin_based_vm_exec_control = 1629 evmcs->pin_based_vm_exec_control; 1630 vmcs12->vm_exit_controls = evmcs->vm_exit_controls; 1631 vmcs12->secondary_vm_exec_control = 1632 evmcs->secondary_vm_exec_control; 1633 } 1634 1635 if (unlikely(!(hv_clean_fields & 1636 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { 1637 vmcs12->io_bitmap_a = evmcs->io_bitmap_a; 1638 vmcs12->io_bitmap_b = evmcs->io_bitmap_b; 1639 } 1640 1641 if (unlikely(!(hv_clean_fields & 1642 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { 1643 vmcs12->msr_bitmap = evmcs->msr_bitmap; 1644 } 1645 1646 if (unlikely(!(hv_clean_fields & 1647 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { 1648 vmcs12->guest_es_base = evmcs->guest_es_base; 1649 vmcs12->guest_cs_base = evmcs->guest_cs_base; 1650 vmcs12->guest_ss_base = evmcs->guest_ss_base; 1651 vmcs12->guest_ds_base = evmcs->guest_ds_base; 1652 vmcs12->guest_fs_base = evmcs->guest_fs_base; 1653 vmcs12->guest_gs_base = evmcs->guest_gs_base; 1654 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; 1655 vmcs12->guest_tr_base = evmcs->guest_tr_base; 1656 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; 1657 vmcs12->guest_idtr_base = evmcs->guest_idtr_base; 1658 vmcs12->guest_es_limit = evmcs->guest_es_limit; 1659 vmcs12->guest_cs_limit = evmcs->guest_cs_limit; 1660 vmcs12->guest_ss_limit = evmcs->guest_ss_limit; 1661 vmcs12->guest_ds_limit = evmcs->guest_ds_limit; 1662 vmcs12->guest_fs_limit = evmcs->guest_fs_limit; 1663 vmcs12->guest_gs_limit = evmcs->guest_gs_limit; 1664 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; 1665 vmcs12->guest_tr_limit = evmcs->guest_tr_limit; 1666 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; 1667 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; 1668 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; 1669 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1670 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1671 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1672 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1673 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1674 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1675 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1676 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1677 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1678 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1679 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1680 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1681 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1682 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1683 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1684 } 1685 1686 if (unlikely(!(hv_clean_fields & 1687 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1688 vmcs12->tsc_offset = evmcs->tsc_offset; 1689 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1690 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1691 } 1692 1693 if (unlikely(!(hv_clean_fields & 1694 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1695 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1696 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1697 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1698 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1699 vmcs12->guest_cr0 = evmcs->guest_cr0; 1700 vmcs12->guest_cr3 = evmcs->guest_cr3; 1701 vmcs12->guest_cr4 = evmcs->guest_cr4; 1702 vmcs12->guest_dr7 = evmcs->guest_dr7; 1703 } 1704 1705 if (unlikely(!(hv_clean_fields & 1706 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1707 vmcs12->host_fs_base = evmcs->host_fs_base; 1708 vmcs12->host_gs_base = evmcs->host_gs_base; 1709 vmcs12->host_tr_base = evmcs->host_tr_base; 1710 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1711 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1712 vmcs12->host_rsp = evmcs->host_rsp; 1713 } 1714 1715 if (unlikely(!(hv_clean_fields & 1716 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1717 vmcs12->ept_pointer = evmcs->ept_pointer; 1718 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1719 } 1720 1721 if (unlikely(!(hv_clean_fields & 1722 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1723 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1724 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1725 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1726 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1727 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1728 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1729 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1730 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1731 vmcs12->guest_pending_dbg_exceptions = 1732 evmcs->guest_pending_dbg_exceptions; 1733 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1734 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1735 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1736 vmcs12->guest_activity_state = evmcs->guest_activity_state; 1737 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1738 } 1739 1740 /* 1741 * Not used? 1742 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1743 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1744 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1745 * vmcs12->page_fault_error_code_mask = 1746 * evmcs->page_fault_error_code_mask; 1747 * vmcs12->page_fault_error_code_match = 1748 * evmcs->page_fault_error_code_match; 1749 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1750 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1751 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1752 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1753 */ 1754 1755 /* 1756 * Read only fields: 1757 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1758 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1759 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1760 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1761 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1762 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1763 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1764 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1765 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1766 * vmcs12->exit_qualification = evmcs->exit_qualification; 1767 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1768 * 1769 * Not present in struct vmcs12: 1770 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1771 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1772 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1773 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1774 */ 1775 1776 return; 1777 } 1778 1779 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1780 { 1781 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1782 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1783 1784 /* 1785 * Should not be changed by KVM: 1786 * 1787 * evmcs->host_es_selector = vmcs12->host_es_selector; 1788 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1789 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1790 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1791 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1792 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1793 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1794 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1795 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1796 * evmcs->host_cr0 = vmcs12->host_cr0; 1797 * evmcs->host_cr3 = vmcs12->host_cr3; 1798 * evmcs->host_cr4 = vmcs12->host_cr4; 1799 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1800 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1801 * evmcs->host_rip = vmcs12->host_rip; 1802 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1803 * evmcs->host_fs_base = vmcs12->host_fs_base; 1804 * evmcs->host_gs_base = vmcs12->host_gs_base; 1805 * evmcs->host_tr_base = vmcs12->host_tr_base; 1806 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1807 * evmcs->host_idtr_base = vmcs12->host_idtr_base; 1808 * evmcs->host_rsp = vmcs12->host_rsp; 1809 * sync_vmcs02_to_vmcs12() doesn't read these: 1810 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1811 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1812 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1813 * evmcs->ept_pointer = vmcs12->ept_pointer; 1814 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1815 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1816 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1817 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1818 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1819 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1820 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1821 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1822 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1823 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1824 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1825 * evmcs->page_fault_error_code_mask = 1826 * vmcs12->page_fault_error_code_mask; 1827 * evmcs->page_fault_error_code_match = 1828 * vmcs12->page_fault_error_code_match; 1829 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1830 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1831 * evmcs->tsc_offset = vmcs12->tsc_offset; 1832 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1833 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1834 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1835 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1836 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1837 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1838 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1839 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1840 * 1841 * Not present in struct vmcs12: 1842 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1843 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1844 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1845 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1846 */ 1847 1848 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1849 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1850 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1851 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1852 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1853 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1854 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1855 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1856 1857 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1858 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1859 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1860 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1861 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1862 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1863 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1864 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1865 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1866 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1867 1868 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1869 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1870 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1871 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1872 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1873 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1874 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1875 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1876 1877 evmcs->guest_es_base = vmcs12->guest_es_base; 1878 evmcs->guest_cs_base = vmcs12->guest_cs_base; 1879 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1880 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1881 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1882 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1883 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1884 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1885 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1886 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1887 1888 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1889 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1890 1891 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1892 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1893 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1894 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1895 1896 evmcs->guest_pending_dbg_exceptions = 1897 vmcs12->guest_pending_dbg_exceptions; 1898 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1899 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1900 1901 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1902 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1903 1904 evmcs->guest_cr0 = vmcs12->guest_cr0; 1905 evmcs->guest_cr3 = vmcs12->guest_cr3; 1906 evmcs->guest_cr4 = vmcs12->guest_cr4; 1907 evmcs->guest_dr7 = vmcs12->guest_dr7; 1908 1909 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1910 1911 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1912 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1913 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1914 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1915 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1916 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1917 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1918 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1919 1920 evmcs->exit_qualification = vmcs12->exit_qualification; 1921 1922 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1923 evmcs->guest_rsp = vmcs12->guest_rsp; 1924 evmcs->guest_rflags = vmcs12->guest_rflags; 1925 1926 evmcs->guest_interruptibility_info = 1927 vmcs12->guest_interruptibility_info; 1928 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1929 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1930 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1931 evmcs->vm_entry_exception_error_code = 1932 vmcs12->vm_entry_exception_error_code; 1933 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 1934 1935 evmcs->guest_rip = vmcs12->guest_rip; 1936 1937 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 1938 1939 return; 1940 } 1941 1942 /* 1943 * This is an equivalent of the nested hypervisor executing the vmptrld 1944 * instruction. 1945 */ 1946 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( 1947 struct kvm_vcpu *vcpu, bool from_launch) 1948 { 1949 struct vcpu_vmx *vmx = to_vmx(vcpu); 1950 bool evmcs_gpa_changed = false; 1951 u64 evmcs_gpa; 1952 1953 if (likely(!vmx->nested.enlightened_vmcs_enabled)) 1954 return EVMPTRLD_DISABLED; 1955 1956 if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) { 1957 nested_release_evmcs(vcpu); 1958 return EVMPTRLD_DISABLED; 1959 } 1960 1961 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { 1962 vmx->nested.current_vmptr = INVALID_GPA; 1963 1964 nested_release_evmcs(vcpu); 1965 1966 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), 1967 &vmx->nested.hv_evmcs_map)) 1968 return EVMPTRLD_ERROR; 1969 1970 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; 1971 1972 /* 1973 * Currently, KVM only supports eVMCS version 1 1974 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this 1975 * value to first u32 field of eVMCS which should specify eVMCS 1976 * VersionNumber. 1977 * 1978 * Guest should be aware of supported eVMCS versions by host by 1979 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is 1980 * expected to set this CPUID leaf according to the value 1981 * returned in vmcs_version from nested_enable_evmcs(). 1982 * 1983 * However, it turns out that Microsoft Hyper-V fails to comply 1984 * to their own invented interface: When Hyper-V use eVMCS, it 1985 * just sets first u32 field of eVMCS to revision_id specified 1986 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number 1987 * which is one of the supported versions specified in 1988 * CPUID.0x4000000A.EAX[0:15]. 1989 * 1990 * To overcome Hyper-V bug, we accept here either a supported 1991 * eVMCS version or VMCS12 revision_id as valid values for first 1992 * u32 field of eVMCS. 1993 */ 1994 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && 1995 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { 1996 nested_release_evmcs(vcpu); 1997 return EVMPTRLD_VMFAIL; 1998 } 1999 2000 vmx->nested.hv_evmcs_vmptr = evmcs_gpa; 2001 2002 evmcs_gpa_changed = true; 2003 /* 2004 * Unlike normal vmcs12, enlightened vmcs12 is not fully 2005 * reloaded from guest's memory (read only fields, fields not 2006 * present in struct hv_enlightened_vmcs, ...). Make sure there 2007 * are no leftovers. 2008 */ 2009 if (from_launch) { 2010 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2011 memset(vmcs12, 0, sizeof(*vmcs12)); 2012 vmcs12->hdr.revision_id = VMCS12_REVISION; 2013 } 2014 2015 } 2016 2017 /* 2018 * Clean fields data can't be used on VMLAUNCH and when we switch 2019 * between different L2 guests as KVM keeps a single VMCS12 per L1. 2020 */ 2021 if (from_launch || evmcs_gpa_changed) 2022 vmx->nested.hv_evmcs->hv_clean_fields &= 2023 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2024 2025 return EVMPTRLD_SUCCEEDED; 2026 } 2027 2028 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2029 { 2030 struct vcpu_vmx *vmx = to_vmx(vcpu); 2031 2032 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2033 copy_vmcs12_to_enlightened(vmx); 2034 else 2035 copy_vmcs12_to_shadow(vmx); 2036 2037 vmx->nested.need_vmcs12_to_shadow_sync = false; 2038 } 2039 2040 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2041 { 2042 struct vcpu_vmx *vmx = 2043 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2044 2045 vmx->nested.preemption_timer_expired = true; 2046 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2047 kvm_vcpu_kick(&vmx->vcpu); 2048 2049 return HRTIMER_NORESTART; 2050 } 2051 2052 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2053 { 2054 struct vcpu_vmx *vmx = to_vmx(vcpu); 2055 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2056 2057 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2058 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2059 2060 if (!vmx->nested.has_preemption_timer_deadline) { 2061 vmx->nested.preemption_timer_deadline = 2062 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2063 vmx->nested.has_preemption_timer_deadline = true; 2064 } 2065 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2066 } 2067 2068 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2069 u64 preemption_timeout) 2070 { 2071 struct vcpu_vmx *vmx = to_vmx(vcpu); 2072 2073 /* 2074 * A timer value of zero is architecturally guaranteed to cause 2075 * a VMExit prior to executing any instructions in the guest. 2076 */ 2077 if (preemption_timeout == 0) { 2078 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2079 return; 2080 } 2081 2082 if (vcpu->arch.virtual_tsc_khz == 0) 2083 return; 2084 2085 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2086 preemption_timeout *= 1000000; 2087 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2088 hrtimer_start(&vmx->nested.preemption_timer, 2089 ktime_add_ns(ktime_get(), preemption_timeout), 2090 HRTIMER_MODE_ABS_PINNED); 2091 } 2092 2093 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2094 { 2095 if (vmx->nested.nested_run_pending && 2096 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2097 return vmcs12->guest_ia32_efer; 2098 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2099 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2100 else 2101 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2102 } 2103 2104 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2105 { 2106 /* 2107 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2108 * according to L0's settings (vmcs12 is irrelevant here). Host 2109 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2110 * will be set as needed prior to VMLAUNCH/VMRESUME. 2111 */ 2112 if (vmx->nested.vmcs02_initialized) 2113 return; 2114 vmx->nested.vmcs02_initialized = true; 2115 2116 /* 2117 * We don't care what the EPTP value is we just need to guarantee 2118 * it's valid so we don't get a false positive when doing early 2119 * consistency checks. 2120 */ 2121 if (enable_ept && nested_early_check) 2122 vmcs_write64(EPT_POINTER, 2123 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2124 2125 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2126 if (cpu_has_vmx_vmfunc()) 2127 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2128 2129 if (cpu_has_vmx_posted_intr()) 2130 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2131 2132 if (cpu_has_vmx_msr_bitmap()) 2133 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2134 2135 /* 2136 * PML is emulated for L2, but never enabled in hardware as the MMU 2137 * handles A/D emulation. Disabling PML for L2 also avoids having to 2138 * deal with filtering out L2 GPAs from the buffer. 2139 */ 2140 if (enable_pml) { 2141 vmcs_write64(PML_ADDRESS, 0); 2142 vmcs_write16(GUEST_PML_INDEX, -1); 2143 } 2144 2145 if (cpu_has_vmx_encls_vmexit()) 2146 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2147 2148 /* 2149 * Set the MSR load/store lists to match L0's settings. Only the 2150 * addresses are constant (for vmcs02), the counts can change based 2151 * on L2's behavior, e.g. switching to/from long mode. 2152 */ 2153 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2154 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2155 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2156 2157 vmx_set_constant_host_state(vmx); 2158 } 2159 2160 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2161 struct vmcs12 *vmcs12) 2162 { 2163 prepare_vmcs02_constant_state(vmx); 2164 2165 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2166 2167 if (enable_vpid) { 2168 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2169 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2170 else 2171 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2172 } 2173 } 2174 2175 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2176 struct vmcs12 *vmcs12) 2177 { 2178 u32 exec_control; 2179 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2180 2181 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2182 prepare_vmcs02_early_rare(vmx, vmcs12); 2183 2184 /* 2185 * PIN CONTROLS 2186 */ 2187 exec_control = __pin_controls_get(vmcs01); 2188 exec_control |= (vmcs12->pin_based_vm_exec_control & 2189 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2190 2191 /* Posted interrupts setting is only taken from vmcs12. */ 2192 vmx->nested.pi_pending = false; 2193 if (nested_cpu_has_posted_intr(vmcs12)) 2194 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2195 else 2196 exec_control &= ~PIN_BASED_POSTED_INTR; 2197 pin_controls_set(vmx, exec_control); 2198 2199 /* 2200 * EXEC CONTROLS 2201 */ 2202 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2203 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2204 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2205 exec_control &= ~CPU_BASED_TPR_SHADOW; 2206 exec_control |= vmcs12->cpu_based_vm_exec_control; 2207 2208 vmx->nested.l1_tpr_threshold = -1; 2209 if (exec_control & CPU_BASED_TPR_SHADOW) 2210 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2211 #ifdef CONFIG_X86_64 2212 else 2213 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2214 CPU_BASED_CR8_STORE_EXITING; 2215 #endif 2216 2217 /* 2218 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2219 * for I/O port accesses. 2220 */ 2221 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2222 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2223 2224 /* 2225 * This bit will be computed in nested_get_vmcs12_pages, because 2226 * we do not have access to L1's MSR bitmap yet. For now, keep 2227 * the same bit as before, hoping to avoid multiple VMWRITEs that 2228 * only set/clear this bit. 2229 */ 2230 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2231 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2232 2233 exec_controls_set(vmx, exec_control); 2234 2235 /* 2236 * SECONDARY EXEC CONTROLS 2237 */ 2238 if (cpu_has_secondary_exec_ctrls()) { 2239 exec_control = __secondary_exec_controls_get(vmcs01); 2240 2241 /* Take the following fields only from vmcs12 */ 2242 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2243 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2244 SECONDARY_EXEC_ENABLE_INVPCID | 2245 SECONDARY_EXEC_ENABLE_RDTSCP | 2246 SECONDARY_EXEC_XSAVES | 2247 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2248 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2249 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2250 SECONDARY_EXEC_ENABLE_VMFUNC | 2251 SECONDARY_EXEC_TSC_SCALING | 2252 SECONDARY_EXEC_DESC); 2253 2254 if (nested_cpu_has(vmcs12, 2255 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2256 exec_control |= vmcs12->secondary_vm_exec_control; 2257 2258 /* PML is emulated and never enabled in hardware for L2. */ 2259 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2260 2261 /* VMCS shadowing for L2 is emulated for now */ 2262 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2263 2264 /* 2265 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2266 * will not have to rewrite the controls just for this bit. 2267 */ 2268 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2269 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2270 exec_control |= SECONDARY_EXEC_DESC; 2271 2272 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2273 vmcs_write16(GUEST_INTR_STATUS, 2274 vmcs12->guest_intr_status); 2275 2276 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2277 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2278 2279 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2280 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2281 2282 secondary_exec_controls_set(vmx, exec_control); 2283 } 2284 2285 /* 2286 * ENTRY CONTROLS 2287 * 2288 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2289 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2290 * on the related bits (if supported by the CPU) in the hope that 2291 * we can avoid VMWrites during vmx_set_efer(). 2292 */ 2293 exec_control = __vm_entry_controls_get(vmcs01); 2294 exec_control |= vmcs12->vm_entry_controls; 2295 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2296 if (cpu_has_load_ia32_efer()) { 2297 if (guest_efer & EFER_LMA) 2298 exec_control |= VM_ENTRY_IA32E_MODE; 2299 if (guest_efer != host_efer) 2300 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2301 } 2302 vm_entry_controls_set(vmx, exec_control); 2303 2304 /* 2305 * EXIT CONTROLS 2306 * 2307 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2308 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2309 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 2310 */ 2311 exec_control = __vm_exit_controls_get(vmcs01); 2312 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2313 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2314 else 2315 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2316 vm_exit_controls_set(vmx, exec_control); 2317 2318 /* 2319 * Interrupt/Exception Fields 2320 */ 2321 if (vmx->nested.nested_run_pending) { 2322 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2323 vmcs12->vm_entry_intr_info_field); 2324 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2325 vmcs12->vm_entry_exception_error_code); 2326 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2327 vmcs12->vm_entry_instruction_len); 2328 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2329 vmcs12->guest_interruptibility_info); 2330 vmx->loaded_vmcs->nmi_known_unmasked = 2331 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2332 } else { 2333 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2334 } 2335 } 2336 2337 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2338 { 2339 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2340 2341 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2342 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2343 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2344 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2345 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2346 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2347 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2348 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2349 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2350 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2351 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2352 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2353 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2354 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2355 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2356 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2357 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2358 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2359 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2360 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2361 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2362 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2363 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2364 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2365 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2366 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2367 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2368 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2369 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2370 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2371 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2372 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2373 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2374 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2375 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2376 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2377 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2378 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2379 2380 vmx->segment_cache.bitmask = 0; 2381 } 2382 2383 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2384 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2385 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2386 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2387 vmcs12->guest_pending_dbg_exceptions); 2388 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2389 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2390 2391 /* 2392 * L1 may access the L2's PDPTR, so save them to construct 2393 * vmcs12 2394 */ 2395 if (enable_ept) { 2396 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2397 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2398 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2399 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2400 } 2401 2402 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2403 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2404 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2405 } 2406 2407 if (nested_cpu_has_xsaves(vmcs12)) 2408 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2409 2410 /* 2411 * Whether page-faults are trapped is determined by a combination of 2412 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2413 * doesn't care about page faults then we should set all of these to 2414 * L1's desires. However, if L0 does care about (some) page faults, it 2415 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2416 * simply ask to exit on each and every L2 page fault. This is done by 2417 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2418 * Note that below we don't need special code to set EB.PF beyond the 2419 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2420 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2421 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2422 */ 2423 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2424 /* 2425 * TODO: if both L0 and L1 need the same MASK and MATCH, 2426 * go ahead and use it? 2427 */ 2428 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2429 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2430 } else { 2431 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2432 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2433 } 2434 2435 if (cpu_has_vmx_apicv()) { 2436 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2437 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2438 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2439 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2440 } 2441 2442 /* 2443 * Make sure the msr_autostore list is up to date before we set the 2444 * count in the vmcs02. 2445 */ 2446 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2447 2448 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2449 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2450 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2451 2452 set_cr4_guest_host_mask(vmx); 2453 } 2454 2455 /* 2456 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2457 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2458 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2459 * guest in a way that will both be appropriate to L1's requests, and our 2460 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2461 * function also has additional necessary side-effects, like setting various 2462 * vcpu->arch fields. 2463 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2464 * is assigned to entry_failure_code on failure. 2465 */ 2466 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2467 bool from_vmentry, 2468 enum vm_entry_failure_code *entry_failure_code) 2469 { 2470 struct vcpu_vmx *vmx = to_vmx(vcpu); 2471 bool load_guest_pdptrs_vmcs12 = false; 2472 2473 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 2474 prepare_vmcs02_rare(vmx, vmcs12); 2475 vmx->nested.dirty_vmcs12 = false; 2476 2477 load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) || 2478 !(vmx->nested.hv_evmcs->hv_clean_fields & 2479 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2480 } 2481 2482 if (vmx->nested.nested_run_pending && 2483 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2484 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2485 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2486 } else { 2487 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2488 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 2489 } 2490 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2491 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2492 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); 2493 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2494 2495 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2496 * bitwise-or of what L1 wants to trap for L2, and what we want to 2497 * trap. Note that CR0.TS also needs updating - we do this later. 2498 */ 2499 vmx_update_exception_bitmap(vcpu); 2500 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2501 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2502 2503 if (vmx->nested.nested_run_pending && 2504 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2505 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2506 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2507 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2508 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2509 } 2510 2511 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 2512 vcpu->arch.l1_tsc_offset, 2513 vmx_get_l2_tsc_offset(vcpu), 2514 vmx_get_l2_tsc_multiplier(vcpu)); 2515 2516 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( 2517 vcpu->arch.l1_tsc_scaling_ratio, 2518 vmx_get_l2_tsc_multiplier(vcpu)); 2519 2520 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2521 if (kvm_has_tsc_control) 2522 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2523 2524 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2525 2526 if (nested_cpu_has_ept(vmcs12)) 2527 nested_ept_init_mmu_context(vcpu); 2528 2529 /* 2530 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those 2531 * bits which we consider mandatory enabled. 2532 * The CR0_READ_SHADOW is what L2 should have expected to read given 2533 * the specifications by L1; It's not enough to take 2534 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we 2535 * have more bits than L1 expected. 2536 */ 2537 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2538 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2539 2540 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2541 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2542 2543 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2544 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2545 vmx_set_efer(vcpu, vcpu->arch.efer); 2546 2547 /* 2548 * Guest state is invalid and unrestricted guest is disabled, 2549 * which means L1 attempted VMEntry to L2 with invalid state. 2550 * Fail the VMEntry. 2551 * 2552 * However when force loading the guest state (SMM exit or 2553 * loading nested state after migration, it is possible to 2554 * have invalid guest state now, which will be later fixed by 2555 * restoring L2 register state 2556 */ 2557 if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { 2558 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2559 return -EINVAL; 2560 } 2561 2562 /* Shadow page tables on either EPT or shadow page tables. */ 2563 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2564 from_vmentry, entry_failure_code)) 2565 return -EINVAL; 2566 2567 /* 2568 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 2569 * on nested VM-Exit, which can occur without actually running L2 and 2570 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with 2571 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the 2572 * transition to HLT instead of running L2. 2573 */ 2574 if (enable_ept) 2575 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2576 2577 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2578 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2579 is_pae_paging(vcpu)) { 2580 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2581 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2582 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2583 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2584 } 2585 2586 if (!enable_ept) 2587 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2588 2589 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2590 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2591 vmcs12->guest_ia32_perf_global_ctrl))) 2592 return -EINVAL; 2593 2594 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2595 kvm_rip_write(vcpu, vmcs12->guest_rip); 2596 2597 /* 2598 * It was observed that genuine Hyper-V running in L1 doesn't reset 2599 * 'hv_clean_fields' by itself, it only sets the corresponding dirty 2600 * bits when it changes a field in eVMCS. Mark all fields as clean 2601 * here. 2602 */ 2603 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2604 vmx->nested.hv_evmcs->hv_clean_fields |= 2605 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2606 2607 return 0; 2608 } 2609 2610 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2611 { 2612 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2613 nested_cpu_has_virtual_nmis(vmcs12))) 2614 return -EINVAL; 2615 2616 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2617 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2618 return -EINVAL; 2619 2620 return 0; 2621 } 2622 2623 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2624 { 2625 struct vcpu_vmx *vmx = to_vmx(vcpu); 2626 2627 /* Check for memory type validity */ 2628 switch (new_eptp & VMX_EPTP_MT_MASK) { 2629 case VMX_EPTP_MT_UC: 2630 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2631 return false; 2632 break; 2633 case VMX_EPTP_MT_WB: 2634 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2635 return false; 2636 break; 2637 default: 2638 return false; 2639 } 2640 2641 /* Page-walk levels validity. */ 2642 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2643 case VMX_EPTP_PWL_5: 2644 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2645 return false; 2646 break; 2647 case VMX_EPTP_PWL_4: 2648 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2649 return false; 2650 break; 2651 default: 2652 return false; 2653 } 2654 2655 /* Reserved bits should not be set */ 2656 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2657 return false; 2658 2659 /* AD, if set, should be supported */ 2660 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2661 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2662 return false; 2663 } 2664 2665 return true; 2666 } 2667 2668 /* 2669 * Checks related to VM-Execution Control Fields 2670 */ 2671 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2672 struct vmcs12 *vmcs12) 2673 { 2674 struct vcpu_vmx *vmx = to_vmx(vcpu); 2675 2676 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2677 vmx->nested.msrs.pinbased_ctls_low, 2678 vmx->nested.msrs.pinbased_ctls_high)) || 2679 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2680 vmx->nested.msrs.procbased_ctls_low, 2681 vmx->nested.msrs.procbased_ctls_high))) 2682 return -EINVAL; 2683 2684 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2685 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2686 vmx->nested.msrs.secondary_ctls_low, 2687 vmx->nested.msrs.secondary_ctls_high))) 2688 return -EINVAL; 2689 2690 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2691 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2692 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2693 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2694 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2695 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2696 nested_vmx_check_nmi_controls(vmcs12) || 2697 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2698 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2699 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2700 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2701 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2702 return -EINVAL; 2703 2704 if (!nested_cpu_has_preemption_timer(vmcs12) && 2705 nested_cpu_has_save_preemption_timer(vmcs12)) 2706 return -EINVAL; 2707 2708 if (nested_cpu_has_ept(vmcs12) && 2709 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2710 return -EINVAL; 2711 2712 if (nested_cpu_has_vmfunc(vmcs12)) { 2713 if (CC(vmcs12->vm_function_control & 2714 ~vmx->nested.msrs.vmfunc_controls)) 2715 return -EINVAL; 2716 2717 if (nested_cpu_has_eptp_switching(vmcs12)) { 2718 if (CC(!nested_cpu_has_ept(vmcs12)) || 2719 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2720 return -EINVAL; 2721 } 2722 } 2723 2724 return 0; 2725 } 2726 2727 /* 2728 * Checks related to VM-Exit Control Fields 2729 */ 2730 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2731 struct vmcs12 *vmcs12) 2732 { 2733 struct vcpu_vmx *vmx = to_vmx(vcpu); 2734 2735 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2736 vmx->nested.msrs.exit_ctls_low, 2737 vmx->nested.msrs.exit_ctls_high)) || 2738 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2739 return -EINVAL; 2740 2741 return 0; 2742 } 2743 2744 /* 2745 * Checks related to VM-Entry Control Fields 2746 */ 2747 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2748 struct vmcs12 *vmcs12) 2749 { 2750 struct vcpu_vmx *vmx = to_vmx(vcpu); 2751 2752 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2753 vmx->nested.msrs.entry_ctls_low, 2754 vmx->nested.msrs.entry_ctls_high))) 2755 return -EINVAL; 2756 2757 /* 2758 * From the Intel SDM, volume 3: 2759 * Fields relevant to VM-entry event injection must be set properly. 2760 * These fields are the VM-entry interruption-information field, the 2761 * VM-entry exception error code, and the VM-entry instruction length. 2762 */ 2763 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2764 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2765 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2766 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2767 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2768 bool should_have_error_code; 2769 bool urg = nested_cpu_has2(vmcs12, 2770 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2771 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2772 2773 /* VM-entry interruption-info field: interruption type */ 2774 if (CC(intr_type == INTR_TYPE_RESERVED) || 2775 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2776 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2777 return -EINVAL; 2778 2779 /* VM-entry interruption-info field: vector */ 2780 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2781 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2782 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2783 return -EINVAL; 2784 2785 /* VM-entry interruption-info field: deliver error code */ 2786 should_have_error_code = 2787 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2788 x86_exception_has_error_code(vector); 2789 if (CC(has_error_code != should_have_error_code)) 2790 return -EINVAL; 2791 2792 /* VM-entry exception error code */ 2793 if (CC(has_error_code && 2794 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2795 return -EINVAL; 2796 2797 /* VM-entry interruption-info field: reserved bits */ 2798 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2799 return -EINVAL; 2800 2801 /* VM-entry instruction length */ 2802 switch (intr_type) { 2803 case INTR_TYPE_SOFT_EXCEPTION: 2804 case INTR_TYPE_SOFT_INTR: 2805 case INTR_TYPE_PRIV_SW_EXCEPTION: 2806 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2807 CC(vmcs12->vm_entry_instruction_len == 0 && 2808 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2809 return -EINVAL; 2810 } 2811 } 2812 2813 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2814 return -EINVAL; 2815 2816 return 0; 2817 } 2818 2819 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2820 struct vmcs12 *vmcs12) 2821 { 2822 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2823 nested_check_vm_exit_controls(vcpu, vmcs12) || 2824 nested_check_vm_entry_controls(vcpu, vmcs12)) 2825 return -EINVAL; 2826 2827 if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled) 2828 return nested_evmcs_check_controls(vmcs12); 2829 2830 return 0; 2831 } 2832 2833 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2834 struct vmcs12 *vmcs12) 2835 { 2836 bool ia32e; 2837 2838 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2839 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2840 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) 2841 return -EINVAL; 2842 2843 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2844 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2845 return -EINVAL; 2846 2847 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2848 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2849 return -EINVAL; 2850 2851 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2852 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2853 vmcs12->host_ia32_perf_global_ctrl))) 2854 return -EINVAL; 2855 2856 #ifdef CONFIG_X86_64 2857 ia32e = !!(vcpu->arch.efer & EFER_LMA); 2858 #else 2859 ia32e = false; 2860 #endif 2861 2862 if (ia32e) { 2863 if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || 2864 CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2865 return -EINVAL; 2866 } else { 2867 if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || 2868 CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2869 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2870 CC((vmcs12->host_rip) >> 32)) 2871 return -EINVAL; 2872 } 2873 2874 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2875 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2876 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2877 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2878 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2879 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2880 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2881 CC(vmcs12->host_cs_selector == 0) || 2882 CC(vmcs12->host_tr_selector == 0) || 2883 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2884 return -EINVAL; 2885 2886 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2887 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2888 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2889 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2890 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2891 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2892 return -EINVAL; 2893 2894 /* 2895 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2896 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2897 * the values of the LMA and LME bits in the field must each be that of 2898 * the host address-space size VM-exit control. 2899 */ 2900 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2901 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2902 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2903 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2904 return -EINVAL; 2905 } 2906 2907 return 0; 2908 } 2909 2910 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2911 struct vmcs12 *vmcs12) 2912 { 2913 int r = 0; 2914 struct vmcs12 *shadow; 2915 struct kvm_host_map map; 2916 2917 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 2918 return 0; 2919 2920 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2921 return -EINVAL; 2922 2923 if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) 2924 return -EINVAL; 2925 2926 shadow = map.hva; 2927 2928 if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || 2929 CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2930 r = -EINVAL; 2931 2932 kvm_vcpu_unmap(vcpu, &map, false); 2933 return r; 2934 } 2935 2936 /* 2937 * Checks related to Guest Non-register State 2938 */ 2939 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2940 { 2941 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2942 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 2943 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 2944 return -EINVAL; 2945 2946 return 0; 2947 } 2948 2949 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 2950 struct vmcs12 *vmcs12, 2951 enum vm_entry_failure_code *entry_failure_code) 2952 { 2953 bool ia32e; 2954 2955 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2956 2957 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 2958 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 2959 return -EINVAL; 2960 2961 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 2962 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 2963 return -EINVAL; 2964 2965 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 2966 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 2967 return -EINVAL; 2968 2969 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 2970 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 2971 return -EINVAL; 2972 } 2973 2974 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2975 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2976 vmcs12->guest_ia32_perf_global_ctrl))) 2977 return -EINVAL; 2978 2979 /* 2980 * If the load IA32_EFER VM-entry control is 1, the following checks 2981 * are performed on the field for the IA32_EFER MSR: 2982 * - Bits reserved in the IA32_EFER MSR must be 0. 2983 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 2984 * the IA-32e mode guest VM-exit control. It must also be identical 2985 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 2986 * CR0.PG) is 1. 2987 */ 2988 if (to_vmx(vcpu)->nested.nested_run_pending && 2989 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 2990 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 2991 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 2992 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 2993 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 2994 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 2995 return -EINVAL; 2996 } 2997 2998 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 2999 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3000 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3001 return -EINVAL; 3002 3003 if (nested_check_guest_non_reg_state(vmcs12)) 3004 return -EINVAL; 3005 3006 return 0; 3007 } 3008 3009 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3010 { 3011 struct vcpu_vmx *vmx = to_vmx(vcpu); 3012 unsigned long cr3, cr4; 3013 bool vm_fail; 3014 3015 if (!nested_early_check) 3016 return 0; 3017 3018 if (vmx->msr_autoload.host.nr) 3019 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3020 if (vmx->msr_autoload.guest.nr) 3021 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3022 3023 preempt_disable(); 3024 3025 vmx_prepare_switch_to_guest(vcpu); 3026 3027 /* 3028 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3029 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3030 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3031 * there is no need to preserve other bits or save/restore the field. 3032 */ 3033 vmcs_writel(GUEST_RFLAGS, 0); 3034 3035 cr3 = __get_current_cr3_fast(); 3036 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3037 vmcs_writel(HOST_CR3, cr3); 3038 vmx->loaded_vmcs->host_state.cr3 = cr3; 3039 } 3040 3041 cr4 = cr4_read_shadow(); 3042 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3043 vmcs_writel(HOST_CR4, cr4); 3044 vmx->loaded_vmcs->host_state.cr4 = cr4; 3045 } 3046 3047 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3048 vmx->loaded_vmcs->launched); 3049 3050 if (vmx->msr_autoload.host.nr) 3051 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3052 if (vmx->msr_autoload.guest.nr) 3053 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3054 3055 if (vm_fail) { 3056 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3057 3058 preempt_enable(); 3059 3060 trace_kvm_nested_vmenter_failed( 3061 "early hardware check VM-instruction error: ", error); 3062 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3063 return 1; 3064 } 3065 3066 /* 3067 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3068 */ 3069 if (hw_breakpoint_active()) 3070 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3071 local_irq_enable(); 3072 preempt_enable(); 3073 3074 /* 3075 * A non-failing VMEntry means we somehow entered guest mode with 3076 * an illegal RIP, and that's just the tip of the iceberg. There 3077 * is no telling what memory has been modified or what state has 3078 * been exposed to unknown code. Hitting this all but guarantees 3079 * a (very critical) hardware issue. 3080 */ 3081 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3082 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3083 3084 return 0; 3085 } 3086 3087 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3088 { 3089 struct vcpu_vmx *vmx = to_vmx(vcpu); 3090 3091 /* 3092 * hv_evmcs may end up being not mapped after migration (when 3093 * L2 was running), map it here to make sure vmcs12 changes are 3094 * properly reflected. 3095 */ 3096 if (vmx->nested.enlightened_vmcs_enabled && 3097 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3098 enum nested_evmptrld_status evmptrld_status = 3099 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3100 3101 if (evmptrld_status == EVMPTRLD_VMFAIL || 3102 evmptrld_status == EVMPTRLD_ERROR) 3103 return false; 3104 3105 /* 3106 * Post migration VMCS12 always provides the most actual 3107 * information, copy it to eVMCS upon entry. 3108 */ 3109 vmx->nested.need_vmcs12_to_shadow_sync = true; 3110 } 3111 3112 return true; 3113 } 3114 3115 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3116 { 3117 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3118 struct vcpu_vmx *vmx = to_vmx(vcpu); 3119 struct kvm_host_map *map; 3120 struct page *page; 3121 u64 hpa; 3122 3123 if (!vcpu->arch.pdptrs_from_userspace && 3124 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3125 /* 3126 * Reload the guest's PDPTRs since after a migration 3127 * the guest CR3 might be restored prior to setting the nested 3128 * state which can lead to a load of wrong PDPTRs. 3129 */ 3130 if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) 3131 return false; 3132 } 3133 3134 3135 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3136 /* 3137 * Translate L1 physical address to host physical 3138 * address for vmcs02. Keep the page pinned, so this 3139 * physical address remains valid. We keep a reference 3140 * to it so we can release it later. 3141 */ 3142 if (vmx->nested.apic_access_page) { /* shouldn't happen */ 3143 kvm_release_page_clean(vmx->nested.apic_access_page); 3144 vmx->nested.apic_access_page = NULL; 3145 } 3146 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); 3147 if (!is_error_page(page)) { 3148 vmx->nested.apic_access_page = page; 3149 hpa = page_to_phys(vmx->nested.apic_access_page); 3150 vmcs_write64(APIC_ACCESS_ADDR, hpa); 3151 } else { 3152 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", 3153 __func__); 3154 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3155 vcpu->run->internal.suberror = 3156 KVM_INTERNAL_ERROR_EMULATION; 3157 vcpu->run->internal.ndata = 0; 3158 return false; 3159 } 3160 } 3161 3162 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3163 map = &vmx->nested.virtual_apic_map; 3164 3165 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3166 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3167 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3168 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3169 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3170 /* 3171 * The processor will never use the TPR shadow, simply 3172 * clear the bit from the execution control. Such a 3173 * configuration is useless, but it happens in tests. 3174 * For any other configuration, failing the vm entry is 3175 * _not_ what the processor does but it's basically the 3176 * only possibility we have. 3177 */ 3178 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3179 } else { 3180 /* 3181 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3182 * force VM-Entry to fail. 3183 */ 3184 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3185 } 3186 } 3187 3188 if (nested_cpu_has_posted_intr(vmcs12)) { 3189 map = &vmx->nested.pi_desc_map; 3190 3191 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3192 vmx->nested.pi_desc = 3193 (struct pi_desc *)(((void *)map->hva) + 3194 offset_in_page(vmcs12->posted_intr_desc_addr)); 3195 vmcs_write64(POSTED_INTR_DESC_ADDR, 3196 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3197 } else { 3198 /* 3199 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3200 * access the contents of the VMCS12 posted interrupt 3201 * descriptor. (Note that KVM may do this when it 3202 * should not, per the architectural specification.) 3203 */ 3204 vmx->nested.pi_desc = NULL; 3205 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3206 } 3207 } 3208 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3209 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3210 else 3211 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3212 3213 return true; 3214 } 3215 3216 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3217 { 3218 if (!nested_get_evmcs_page(vcpu)) { 3219 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3220 __func__); 3221 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3222 vcpu->run->internal.suberror = 3223 KVM_INTERNAL_ERROR_EMULATION; 3224 vcpu->run->internal.ndata = 0; 3225 3226 return false; 3227 } 3228 3229 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3230 return false; 3231 3232 return true; 3233 } 3234 3235 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3236 { 3237 struct vmcs12 *vmcs12; 3238 struct vcpu_vmx *vmx = to_vmx(vcpu); 3239 gpa_t dst; 3240 3241 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3242 return 0; 3243 3244 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3245 return 1; 3246 3247 /* 3248 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3249 * set is already checked as part of A/D emulation. 3250 */ 3251 vmcs12 = get_vmcs12(vcpu); 3252 if (!nested_cpu_has_pml(vmcs12)) 3253 return 0; 3254 3255 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3256 vmx->nested.pml_full = true; 3257 return 1; 3258 } 3259 3260 gpa &= ~0xFFFull; 3261 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3262 3263 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3264 offset_in_page(dst), sizeof(gpa))) 3265 return 0; 3266 3267 vmcs12->guest_pml_index--; 3268 3269 return 0; 3270 } 3271 3272 /* 3273 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3274 * for running VMX instructions (except VMXON, whose prerequisites are 3275 * slightly different). It also specifies what exception to inject otherwise. 3276 * Note that many of these exceptions have priority over VM exits, so they 3277 * don't have to be checked again here. 3278 */ 3279 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3280 { 3281 if (!to_vmx(vcpu)->nested.vmxon) { 3282 kvm_queue_exception(vcpu, UD_VECTOR); 3283 return 0; 3284 } 3285 3286 if (vmx_get_cpl(vcpu)) { 3287 kvm_inject_gp(vcpu, 0); 3288 return 0; 3289 } 3290 3291 return 1; 3292 } 3293 3294 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3295 { 3296 u8 rvi = vmx_get_rvi(); 3297 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3298 3299 return ((rvi & 0xf0) > (vppr & 0xf0)); 3300 } 3301 3302 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3303 struct vmcs12 *vmcs12); 3304 3305 /* 3306 * If from_vmentry is false, this is being called from state restore (either RSM 3307 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 3308 * 3309 * Returns: 3310 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3311 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3312 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3313 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3314 */ 3315 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3316 bool from_vmentry) 3317 { 3318 struct vcpu_vmx *vmx = to_vmx(vcpu); 3319 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3320 enum vm_entry_failure_code entry_failure_code; 3321 bool evaluate_pending_interrupts; 3322 union vmx_exit_reason exit_reason = { 3323 .basic = EXIT_REASON_INVALID_STATE, 3324 .failed_vmentry = 1, 3325 }; 3326 u32 failed_index; 3327 3328 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) 3329 kvm_vcpu_flush_tlb_current(vcpu); 3330 3331 evaluate_pending_interrupts = exec_controls_get(vmx) & 3332 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3333 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3334 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3335 3336 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3337 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3338 if (kvm_mpx_supported() && 3339 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 3340 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3341 3342 /* 3343 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3344 * nested early checks are disabled. In the event of a "late" VM-Fail, 3345 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3346 * software model to the pre-VMEntry host state. When EPT is disabled, 3347 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3348 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3349 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3350 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3351 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3352 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3353 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3354 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3355 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3356 * path would need to manually save/restore vmcs01.GUEST_CR3. 3357 */ 3358 if (!enable_ept && !nested_early_check) 3359 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3360 3361 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3362 3363 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3364 3365 if (from_vmentry) { 3366 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3367 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3368 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3369 } 3370 3371 if (nested_vmx_check_vmentry_hw(vcpu)) { 3372 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3373 return NVMX_VMENTRY_VMFAIL; 3374 } 3375 3376 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3377 &entry_failure_code)) { 3378 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3379 vmcs12->exit_qualification = entry_failure_code; 3380 goto vmentry_fail_vmexit; 3381 } 3382 } 3383 3384 enter_guest_mode(vcpu); 3385 3386 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3387 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3388 vmcs12->exit_qualification = entry_failure_code; 3389 goto vmentry_fail_vmexit_guest_mode; 3390 } 3391 3392 if (from_vmentry) { 3393 failed_index = nested_vmx_load_msr(vcpu, 3394 vmcs12->vm_entry_msr_load_addr, 3395 vmcs12->vm_entry_msr_load_count); 3396 if (failed_index) { 3397 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3398 vmcs12->exit_qualification = failed_index; 3399 goto vmentry_fail_vmexit_guest_mode; 3400 } 3401 } else { 3402 /* 3403 * The MMU is not initialized to point at the right entities yet and 3404 * "get pages" would need to read data from the guest (i.e. we will 3405 * need to perform gpa to hpa translation). Request a call 3406 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3407 * have already been set at vmentry time and should not be reset. 3408 */ 3409 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3410 } 3411 3412 /* 3413 * If L1 had a pending IRQ/NMI until it executed 3414 * VMLAUNCH/VMRESUME which wasn't delivered because it was 3415 * disallowed (e.g. interrupts disabled), L0 needs to 3416 * evaluate if this pending event should cause an exit from L2 3417 * to L1 or delivered directly to L2 (e.g. In case L1 don't 3418 * intercept EXTERNAL_INTERRUPT). 3419 * 3420 * Usually this would be handled by the processor noticing an 3421 * IRQ/NMI window request, or checking RVI during evaluation of 3422 * pending virtual interrupts. However, this setting was done 3423 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 3424 * to perform pending event evaluation by requesting a KVM_REQ_EVENT. 3425 */ 3426 if (unlikely(evaluate_pending_interrupts)) 3427 kvm_make_request(KVM_REQ_EVENT, vcpu); 3428 3429 /* 3430 * Do not start the preemption timer hrtimer until after we know 3431 * we are successful, so that only nested_vmx_vmexit needs to cancel 3432 * the timer. 3433 */ 3434 vmx->nested.preemption_timer_expired = false; 3435 if (nested_cpu_has_preemption_timer(vmcs12)) { 3436 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3437 vmx_start_preemption_timer(vcpu, timer_value); 3438 } 3439 3440 /* 3441 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3442 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3443 * returned as far as L1 is concerned. It will only return (and set 3444 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3445 */ 3446 return NVMX_VMENTRY_SUCCESS; 3447 3448 /* 3449 * A failed consistency check that leads to a VMExit during L1's 3450 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3451 * 26.7 "VM-entry failures during or after loading guest state". 3452 */ 3453 vmentry_fail_vmexit_guest_mode: 3454 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3455 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3456 leave_guest_mode(vcpu); 3457 3458 vmentry_fail_vmexit: 3459 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3460 3461 if (!from_vmentry) 3462 return NVMX_VMENTRY_VMEXIT; 3463 3464 load_vmcs12_host_state(vcpu, vmcs12); 3465 vmcs12->vm_exit_reason = exit_reason.full; 3466 if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 3467 vmx->nested.need_vmcs12_to_shadow_sync = true; 3468 return NVMX_VMENTRY_VMEXIT; 3469 } 3470 3471 /* 3472 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3473 * for running an L2 nested guest. 3474 */ 3475 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3476 { 3477 struct vmcs12 *vmcs12; 3478 enum nvmx_vmentry_status status; 3479 struct vcpu_vmx *vmx = to_vmx(vcpu); 3480 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3481 enum nested_evmptrld_status evmptrld_status; 3482 3483 if (!nested_vmx_check_permission(vcpu)) 3484 return 1; 3485 3486 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3487 if (evmptrld_status == EVMPTRLD_ERROR) { 3488 kvm_queue_exception(vcpu, UD_VECTOR); 3489 return 1; 3490 } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) { 3491 return nested_vmx_failInvalid(vcpu); 3492 } 3493 3494 if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && 3495 vmx->nested.current_vmptr == INVALID_GPA)) 3496 return nested_vmx_failInvalid(vcpu); 3497 3498 vmcs12 = get_vmcs12(vcpu); 3499 3500 /* 3501 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3502 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3503 * rather than RFLAGS.ZF, and no error number is stored to the 3504 * VM-instruction error field. 3505 */ 3506 if (CC(vmcs12->hdr.shadow_vmcs)) 3507 return nested_vmx_failInvalid(vcpu); 3508 3509 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 3510 copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); 3511 /* Enlightened VMCS doesn't have launch state */ 3512 vmcs12->launch_state = !launch; 3513 } else if (enable_shadow_vmcs) { 3514 copy_shadow_to_vmcs12(vmx); 3515 } 3516 3517 /* 3518 * The nested entry process starts with enforcing various prerequisites 3519 * on vmcs12 as required by the Intel SDM, and act appropriately when 3520 * they fail: As the SDM explains, some conditions should cause the 3521 * instruction to fail, while others will cause the instruction to seem 3522 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3523 * To speed up the normal (success) code path, we should avoid checking 3524 * for misconfigurations which will anyway be caught by the processor 3525 * when using the merged vmcs02. 3526 */ 3527 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3528 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3529 3530 if (CC(vmcs12->launch_state == launch)) 3531 return nested_vmx_fail(vcpu, 3532 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS 3533 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3534 3535 if (nested_vmx_check_controls(vcpu, vmcs12)) 3536 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3537 3538 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3539 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3540 3541 /* 3542 * We're finally done with prerequisite checking, and can start with 3543 * the nested entry. 3544 */ 3545 vmx->nested.nested_run_pending = 1; 3546 vmx->nested.has_preemption_timer_deadline = false; 3547 status = nested_vmx_enter_non_root_mode(vcpu, true); 3548 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3549 goto vmentry_failed; 3550 3551 /* Emulate processing of posted interrupts on VM-Enter. */ 3552 if (nested_cpu_has_posted_intr(vmcs12) && 3553 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3554 vmx->nested.pi_pending = true; 3555 kvm_make_request(KVM_REQ_EVENT, vcpu); 3556 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3557 } 3558 3559 /* Hide L1D cache contents from the nested guest. */ 3560 vmx->vcpu.arch.l1tf_flush_l1d = true; 3561 3562 /* 3563 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3564 * also be used as part of restoring nVMX state for 3565 * snapshot restore (migration). 3566 * 3567 * In this flow, it is assumed that vmcs12 cache was 3568 * transferred as part of captured nVMX state and should 3569 * therefore not be read from guest memory (which may not 3570 * exist on destination host yet). 3571 */ 3572 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3573 3574 switch (vmcs12->guest_activity_state) { 3575 case GUEST_ACTIVITY_HLT: 3576 /* 3577 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3578 * awakened by event injection or by an NMI-window VM-exit or 3579 * by an interrupt-window VM-exit, halt the vcpu. 3580 */ 3581 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3582 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3583 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3584 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3585 vmx->nested.nested_run_pending = 0; 3586 return kvm_vcpu_halt(vcpu); 3587 } 3588 break; 3589 case GUEST_ACTIVITY_WAIT_SIPI: 3590 vmx->nested.nested_run_pending = 0; 3591 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3592 break; 3593 default: 3594 break; 3595 } 3596 3597 return 1; 3598 3599 vmentry_failed: 3600 vmx->nested.nested_run_pending = 0; 3601 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3602 return 0; 3603 if (status == NVMX_VMENTRY_VMEXIT) 3604 return 1; 3605 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3606 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3607 } 3608 3609 /* 3610 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3611 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3612 * This function returns the new value we should put in vmcs12.guest_cr0. 3613 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3614 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3615 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3616 * didn't trap the bit, because if L1 did, so would L0). 3617 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3618 * been modified by L2, and L1 knows it. So just leave the old value of 3619 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3620 * isn't relevant, because if L0 traps this bit it can set it to anything. 3621 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3622 * changed these bits, and therefore they need to be updated, but L0 3623 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3624 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3625 */ 3626 static inline unsigned long 3627 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3628 { 3629 return 3630 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3631 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3632 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3633 vcpu->arch.cr0_guest_owned_bits)); 3634 } 3635 3636 static inline unsigned long 3637 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3638 { 3639 return 3640 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3641 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3642 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3643 vcpu->arch.cr4_guest_owned_bits)); 3644 } 3645 3646 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3647 struct vmcs12 *vmcs12) 3648 { 3649 u32 idt_vectoring; 3650 unsigned int nr; 3651 3652 if (vcpu->arch.exception.injected) { 3653 nr = vcpu->arch.exception.nr; 3654 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3655 3656 if (kvm_exception_is_soft(nr)) { 3657 vmcs12->vm_exit_instruction_len = 3658 vcpu->arch.event_exit_inst_len; 3659 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3660 } else 3661 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3662 3663 if (vcpu->arch.exception.has_error_code) { 3664 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3665 vmcs12->idt_vectoring_error_code = 3666 vcpu->arch.exception.error_code; 3667 } 3668 3669 vmcs12->idt_vectoring_info_field = idt_vectoring; 3670 } else if (vcpu->arch.nmi_injected) { 3671 vmcs12->idt_vectoring_info_field = 3672 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3673 } else if (vcpu->arch.interrupt.injected) { 3674 nr = vcpu->arch.interrupt.nr; 3675 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3676 3677 if (vcpu->arch.interrupt.soft) { 3678 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3679 vmcs12->vm_entry_instruction_len = 3680 vcpu->arch.event_exit_inst_len; 3681 } else 3682 idt_vectoring |= INTR_TYPE_EXT_INTR; 3683 3684 vmcs12->idt_vectoring_info_field = idt_vectoring; 3685 } 3686 } 3687 3688 3689 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3690 { 3691 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3692 gfn_t gfn; 3693 3694 /* 3695 * Don't need to mark the APIC access page dirty; it is never 3696 * written to by the CPU during APIC virtualization. 3697 */ 3698 3699 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3700 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3701 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3702 } 3703 3704 if (nested_cpu_has_posted_intr(vmcs12)) { 3705 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3706 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3707 } 3708 } 3709 3710 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3711 { 3712 struct vcpu_vmx *vmx = to_vmx(vcpu); 3713 int max_irr; 3714 void *vapic_page; 3715 u16 status; 3716 3717 if (!vmx->nested.pi_pending) 3718 return 0; 3719 3720 if (!vmx->nested.pi_desc) 3721 goto mmio_needed; 3722 3723 vmx->nested.pi_pending = false; 3724 3725 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3726 return 0; 3727 3728 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3729 if (max_irr != 256) { 3730 vapic_page = vmx->nested.virtual_apic_map.hva; 3731 if (!vapic_page) 3732 goto mmio_needed; 3733 3734 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3735 vapic_page, &max_irr); 3736 status = vmcs_read16(GUEST_INTR_STATUS); 3737 if ((u8)max_irr > ((u8)status & 0xff)) { 3738 status &= ~0xff; 3739 status |= (u8)max_irr; 3740 vmcs_write16(GUEST_INTR_STATUS, status); 3741 } 3742 } 3743 3744 nested_mark_vmcs12_pages_dirty(vcpu); 3745 return 0; 3746 3747 mmio_needed: 3748 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3749 return -ENXIO; 3750 } 3751 3752 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3753 unsigned long exit_qual) 3754 { 3755 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3756 unsigned int nr = vcpu->arch.exception.nr; 3757 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3758 3759 if (vcpu->arch.exception.has_error_code) { 3760 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3761 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3762 } 3763 3764 if (kvm_exception_is_soft(nr)) 3765 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3766 else 3767 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3768 3769 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3770 vmx_get_nmi_mask(vcpu)) 3771 intr_info |= INTR_INFO_UNBLOCK_NMI; 3772 3773 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3774 } 3775 3776 /* 3777 * Returns true if a debug trap is pending delivery. 3778 * 3779 * In KVM, debug traps bear an exception payload. As such, the class of a #DB 3780 * exception may be inferred from the presence of an exception payload. 3781 */ 3782 static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) 3783 { 3784 return vcpu->arch.exception.pending && 3785 vcpu->arch.exception.nr == DB_VECTOR && 3786 vcpu->arch.exception.payload; 3787 } 3788 3789 /* 3790 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 3791 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 3792 * represents these debug traps with a payload that is said to be compatible 3793 * with the 'pending debug exceptions' field, write the payload to the VMCS 3794 * field if a VM-exit is delivered before the debug trap. 3795 */ 3796 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 3797 { 3798 if (vmx_pending_dbg_trap(vcpu)) 3799 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 3800 vcpu->arch.exception.payload); 3801 } 3802 3803 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 3804 { 3805 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3806 to_vmx(vcpu)->nested.preemption_timer_expired; 3807 } 3808 3809 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 3810 { 3811 struct vcpu_vmx *vmx = to_vmx(vcpu); 3812 unsigned long exit_qual; 3813 bool block_nested_events = 3814 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3815 bool mtf_pending = vmx->nested.mtf_pending; 3816 struct kvm_lapic *apic = vcpu->arch.apic; 3817 3818 /* 3819 * Clear the MTF state. If a higher priority VM-exit is delivered first, 3820 * this state is discarded. 3821 */ 3822 if (!block_nested_events) 3823 vmx->nested.mtf_pending = false; 3824 3825 if (lapic_in_kernel(vcpu) && 3826 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3827 if (block_nested_events) 3828 return -EBUSY; 3829 nested_vmx_update_pending_dbg(vcpu); 3830 clear_bit(KVM_APIC_INIT, &apic->pending_events); 3831 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 3832 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3833 return 0; 3834 } 3835 3836 if (lapic_in_kernel(vcpu) && 3837 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 3838 if (block_nested_events) 3839 return -EBUSY; 3840 3841 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 3842 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 3843 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 3844 apic->sipi_vector & 0xFFUL); 3845 return 0; 3846 } 3847 3848 /* 3849 * Process any exceptions that are not debug traps before MTF. 3850 * 3851 * Note that only a pending nested run can block a pending exception. 3852 * Otherwise an injected NMI/interrupt should either be 3853 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO, 3854 * while delivering the pending exception. 3855 */ 3856 3857 if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { 3858 if (vmx->nested.nested_run_pending) 3859 return -EBUSY; 3860 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3861 goto no_vmexit; 3862 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3863 return 0; 3864 } 3865 3866 if (mtf_pending) { 3867 if (block_nested_events) 3868 return -EBUSY; 3869 nested_vmx_update_pending_dbg(vcpu); 3870 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 3871 return 0; 3872 } 3873 3874 if (vcpu->arch.exception.pending) { 3875 if (vmx->nested.nested_run_pending) 3876 return -EBUSY; 3877 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3878 goto no_vmexit; 3879 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3880 return 0; 3881 } 3882 3883 if (nested_vmx_preemption_timer_pending(vcpu)) { 3884 if (block_nested_events) 3885 return -EBUSY; 3886 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3887 return 0; 3888 } 3889 3890 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 3891 if (block_nested_events) 3892 return -EBUSY; 3893 goto no_vmexit; 3894 } 3895 3896 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 3897 if (block_nested_events) 3898 return -EBUSY; 3899 if (!nested_exit_on_nmi(vcpu)) 3900 goto no_vmexit; 3901 3902 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3903 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3904 INTR_INFO_VALID_MASK, 0); 3905 /* 3906 * The NMI-triggered VM exit counts as injection: 3907 * clear this one and block further NMIs. 3908 */ 3909 vcpu->arch.nmi_pending = 0; 3910 vmx_set_nmi_mask(vcpu, true); 3911 return 0; 3912 } 3913 3914 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 3915 if (block_nested_events) 3916 return -EBUSY; 3917 if (!nested_exit_on_intr(vcpu)) 3918 goto no_vmexit; 3919 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3920 return 0; 3921 } 3922 3923 no_vmexit: 3924 return vmx_complete_nested_posted_interrupt(vcpu); 3925 } 3926 3927 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3928 { 3929 ktime_t remaining = 3930 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 3931 u64 value; 3932 3933 if (ktime_to_ns(remaining) <= 0) 3934 return 0; 3935 3936 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 3937 do_div(value, 1000000); 3938 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 3939 } 3940 3941 static bool is_vmcs12_ext_field(unsigned long field) 3942 { 3943 switch (field) { 3944 case GUEST_ES_SELECTOR: 3945 case GUEST_CS_SELECTOR: 3946 case GUEST_SS_SELECTOR: 3947 case GUEST_DS_SELECTOR: 3948 case GUEST_FS_SELECTOR: 3949 case GUEST_GS_SELECTOR: 3950 case GUEST_LDTR_SELECTOR: 3951 case GUEST_TR_SELECTOR: 3952 case GUEST_ES_LIMIT: 3953 case GUEST_CS_LIMIT: 3954 case GUEST_SS_LIMIT: 3955 case GUEST_DS_LIMIT: 3956 case GUEST_FS_LIMIT: 3957 case GUEST_GS_LIMIT: 3958 case GUEST_LDTR_LIMIT: 3959 case GUEST_TR_LIMIT: 3960 case GUEST_GDTR_LIMIT: 3961 case GUEST_IDTR_LIMIT: 3962 case GUEST_ES_AR_BYTES: 3963 case GUEST_DS_AR_BYTES: 3964 case GUEST_FS_AR_BYTES: 3965 case GUEST_GS_AR_BYTES: 3966 case GUEST_LDTR_AR_BYTES: 3967 case GUEST_TR_AR_BYTES: 3968 case GUEST_ES_BASE: 3969 case GUEST_CS_BASE: 3970 case GUEST_SS_BASE: 3971 case GUEST_DS_BASE: 3972 case GUEST_FS_BASE: 3973 case GUEST_GS_BASE: 3974 case GUEST_LDTR_BASE: 3975 case GUEST_TR_BASE: 3976 case GUEST_GDTR_BASE: 3977 case GUEST_IDTR_BASE: 3978 case GUEST_PENDING_DBG_EXCEPTIONS: 3979 case GUEST_BNDCFGS: 3980 return true; 3981 default: 3982 break; 3983 } 3984 3985 return false; 3986 } 3987 3988 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3989 struct vmcs12 *vmcs12) 3990 { 3991 struct vcpu_vmx *vmx = to_vmx(vcpu); 3992 3993 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 3994 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 3995 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 3996 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 3997 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 3998 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 3999 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4000 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4001 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4002 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4003 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4004 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4005 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4006 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4007 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4008 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4009 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4010 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4011 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4012 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4013 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4014 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4015 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4016 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4017 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4018 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4019 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4020 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4021 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4022 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4023 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4024 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4025 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4026 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4027 vmcs12->guest_pending_dbg_exceptions = 4028 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4029 if (kvm_mpx_supported()) 4030 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 4031 4032 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4033 } 4034 4035 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4036 struct vmcs12 *vmcs12) 4037 { 4038 struct vcpu_vmx *vmx = to_vmx(vcpu); 4039 int cpu; 4040 4041 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4042 return; 4043 4044 4045 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4046 4047 cpu = get_cpu(); 4048 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4049 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4050 4051 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4052 4053 vmx->loaded_vmcs = &vmx->vmcs01; 4054 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4055 put_cpu(); 4056 } 4057 4058 /* 4059 * Update the guest state fields of vmcs12 to reflect changes that 4060 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4061 * VM-entry controls is also updated, since this is really a guest 4062 * state bit.) 4063 */ 4064 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4065 { 4066 struct vcpu_vmx *vmx = to_vmx(vcpu); 4067 4068 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 4069 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4070 4071 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4072 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); 4073 4074 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4075 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4076 4077 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4078 vmcs12->guest_rip = kvm_rip_read(vcpu); 4079 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4080 4081 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4082 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4083 4084 vmcs12->guest_interruptibility_info = 4085 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4086 4087 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4088 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4089 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4090 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4091 else 4092 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4093 4094 if (nested_cpu_has_preemption_timer(vmcs12) && 4095 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4096 !vmx->nested.nested_run_pending) 4097 vmcs12->vmx_preemption_timer_value = 4098 vmx_get_preemption_timer_value(vcpu); 4099 4100 /* 4101 * In some cases (usually, nested EPT), L2 is allowed to change its 4102 * own CR3 without exiting. If it has changed it, we must keep it. 4103 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4104 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4105 * 4106 * Additionally, restore L2's PDPTR to vmcs12. 4107 */ 4108 if (enable_ept) { 4109 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4110 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4111 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4112 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4113 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4114 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4115 } 4116 } 4117 4118 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4119 4120 if (nested_cpu_has_vid(vmcs12)) 4121 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4122 4123 vmcs12->vm_entry_controls = 4124 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4125 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4126 4127 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4128 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 4129 4130 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4131 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4132 } 4133 4134 /* 4135 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4136 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4137 * and this function updates it to reflect the changes to the guest state while 4138 * L2 was running (and perhaps made some exits which were handled directly by L0 4139 * without going back to L1), and to reflect the exit reason. 4140 * Note that we do not have to copy here all VMCS fields, just those that 4141 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4142 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4143 * which already writes to vmcs12 directly. 4144 */ 4145 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4146 u32 vm_exit_reason, u32 exit_intr_info, 4147 unsigned long exit_qualification) 4148 { 4149 /* update exit information fields: */ 4150 vmcs12->vm_exit_reason = vm_exit_reason; 4151 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4152 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4153 vmcs12->exit_qualification = exit_qualification; 4154 vmcs12->vm_exit_intr_info = exit_intr_info; 4155 4156 vmcs12->idt_vectoring_info_field = 0; 4157 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4158 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4159 4160 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4161 vmcs12->launch_state = 1; 4162 4163 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4164 * instead of reading the real value. */ 4165 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4166 4167 /* 4168 * Transfer the event that L0 or L1 may wanted to inject into 4169 * L2 to IDT_VECTORING_INFO_FIELD. 4170 */ 4171 vmcs12_save_pending_event(vcpu, vmcs12); 4172 4173 /* 4174 * According to spec, there's no need to store the guest's 4175 * MSRs if the exit is due to a VM-entry failure that occurs 4176 * during or after loading the guest state. Since this exit 4177 * does not fall in that category, we need to save the MSRs. 4178 */ 4179 if (nested_vmx_store_msr(vcpu, 4180 vmcs12->vm_exit_msr_store_addr, 4181 vmcs12->vm_exit_msr_store_count)) 4182 nested_vmx_abort(vcpu, 4183 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4184 } 4185 4186 /* 4187 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 4188 * preserved above and would only end up incorrectly in L1. 4189 */ 4190 vcpu->arch.nmi_injected = false; 4191 kvm_clear_exception_queue(vcpu); 4192 kvm_clear_interrupt_queue(vcpu); 4193 } 4194 4195 /* 4196 * A part of what we need to when the nested L2 guest exits and we want to 4197 * run its L1 parent, is to reset L1's guest state to the host state specified 4198 * in vmcs12. 4199 * This function is to be called not only on normal nested exit, but also on 4200 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4201 * Failures During or After Loading Guest State"). 4202 * This function should be called when the active VMCS is L1's (vmcs01). 4203 */ 4204 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4205 struct vmcs12 *vmcs12) 4206 { 4207 enum vm_entry_failure_code ignored; 4208 struct kvm_segment seg; 4209 4210 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4211 vcpu->arch.efer = vmcs12->host_ia32_efer; 4212 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4213 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4214 else 4215 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4216 vmx_set_efer(vcpu, vcpu->arch.efer); 4217 4218 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4219 kvm_rip_write(vcpu, vmcs12->host_rip); 4220 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4221 vmx_set_interrupt_shadow(vcpu, 0); 4222 4223 /* 4224 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4225 * actually changed, because vmx_set_cr0 refers to efer set above. 4226 * 4227 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4228 * (KVM doesn't change it); 4229 */ 4230 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4231 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4232 4233 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 4234 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4235 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4236 4237 nested_ept_uninit_mmu_context(vcpu); 4238 4239 /* 4240 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4241 * couldn't have changed. 4242 */ 4243 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4244 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4245 4246 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4247 4248 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4249 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4250 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4251 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4252 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4253 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4254 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4255 4256 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4257 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4258 vmcs_write64(GUEST_BNDCFGS, 0); 4259 4260 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4261 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4262 vcpu->arch.pat = vmcs12->host_ia32_pat; 4263 } 4264 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 4265 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4266 vmcs12->host_ia32_perf_global_ctrl)); 4267 4268 /* Set L1 segment info according to Intel SDM 4269 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4270 seg = (struct kvm_segment) { 4271 .base = 0, 4272 .limit = 0xFFFFFFFF, 4273 .selector = vmcs12->host_cs_selector, 4274 .type = 11, 4275 .present = 1, 4276 .s = 1, 4277 .g = 1 4278 }; 4279 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4280 seg.l = 1; 4281 else 4282 seg.db = 1; 4283 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4284 seg = (struct kvm_segment) { 4285 .base = 0, 4286 .limit = 0xFFFFFFFF, 4287 .type = 3, 4288 .present = 1, 4289 .s = 1, 4290 .db = 1, 4291 .g = 1 4292 }; 4293 seg.selector = vmcs12->host_ds_selector; 4294 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4295 seg.selector = vmcs12->host_es_selector; 4296 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4297 seg.selector = vmcs12->host_ss_selector; 4298 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4299 seg.selector = vmcs12->host_fs_selector; 4300 seg.base = vmcs12->host_fs_base; 4301 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4302 seg.selector = vmcs12->host_gs_selector; 4303 seg.base = vmcs12->host_gs_base; 4304 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4305 seg = (struct kvm_segment) { 4306 .base = vmcs12->host_tr_base, 4307 .limit = 0x67, 4308 .selector = vmcs12->host_tr_selector, 4309 .type = 11, 4310 .present = 1 4311 }; 4312 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4313 4314 memset(&seg, 0, sizeof(seg)); 4315 seg.unusable = 1; 4316 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4317 4318 kvm_set_dr(vcpu, 7, 0x400); 4319 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4320 4321 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4322 vmcs12->vm_exit_msr_load_count)) 4323 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4324 4325 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4326 } 4327 4328 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4329 { 4330 struct vmx_uret_msr *efer_msr; 4331 unsigned int i; 4332 4333 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4334 return vmcs_read64(GUEST_IA32_EFER); 4335 4336 if (cpu_has_load_ia32_efer()) 4337 return host_efer; 4338 4339 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4340 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4341 return vmx->msr_autoload.guest.val[i].value; 4342 } 4343 4344 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4345 if (efer_msr) 4346 return efer_msr->data; 4347 4348 return host_efer; 4349 } 4350 4351 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4352 { 4353 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4354 struct vcpu_vmx *vmx = to_vmx(vcpu); 4355 struct vmx_msr_entry g, h; 4356 gpa_t gpa; 4357 u32 i, j; 4358 4359 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4360 4361 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4362 /* 4363 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4364 * as vmcs01.GUEST_DR7 contains a userspace defined value 4365 * and vcpu->arch.dr7 is not squirreled away before the 4366 * nested VMENTER (not worth adding a variable in nested_vmx). 4367 */ 4368 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4369 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4370 else 4371 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4372 } 4373 4374 /* 4375 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4376 * handle a variety of side effects to KVM's software model. 4377 */ 4378 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4379 4380 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4381 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4382 4383 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4384 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4385 4386 nested_ept_uninit_mmu_context(vcpu); 4387 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4388 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4389 4390 /* 4391 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4392 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4393 * VMFail, like everything else we just need to ensure our 4394 * software model is up-to-date. 4395 */ 4396 if (enable_ept && is_pae_paging(vcpu)) 4397 ept_save_pdptrs(vcpu); 4398 4399 kvm_mmu_reset_context(vcpu); 4400 4401 /* 4402 * This nasty bit of open coding is a compromise between blindly 4403 * loading L1's MSRs using the exit load lists (incorrect emulation 4404 * of VMFail), leaving the nested VM's MSRs in the software model 4405 * (incorrect behavior) and snapshotting the modified MSRs (too 4406 * expensive since the lists are unbound by hardware). For each 4407 * MSR that was (prematurely) loaded from the nested VMEntry load 4408 * list, reload it from the exit load list if it exists and differs 4409 * from the guest value. The intent is to stuff host state as 4410 * silently as possible, not to fully process the exit load list. 4411 */ 4412 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4413 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4414 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4415 pr_debug_ratelimited( 4416 "%s read MSR index failed (%u, 0x%08llx)\n", 4417 __func__, i, gpa); 4418 goto vmabort; 4419 } 4420 4421 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4422 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4423 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4424 pr_debug_ratelimited( 4425 "%s read MSR failed (%u, 0x%08llx)\n", 4426 __func__, j, gpa); 4427 goto vmabort; 4428 } 4429 if (h.index != g.index) 4430 continue; 4431 if (h.value == g.value) 4432 break; 4433 4434 if (nested_vmx_load_msr_check(vcpu, &h)) { 4435 pr_debug_ratelimited( 4436 "%s check failed (%u, 0x%x, 0x%x)\n", 4437 __func__, j, h.index, h.reserved); 4438 goto vmabort; 4439 } 4440 4441 if (kvm_set_msr(vcpu, h.index, h.value)) { 4442 pr_debug_ratelimited( 4443 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4444 __func__, j, h.index, h.value); 4445 goto vmabort; 4446 } 4447 } 4448 } 4449 4450 return; 4451 4452 vmabort: 4453 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4454 } 4455 4456 /* 4457 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4458 * and modify vmcs12 to make it see what it would expect to see there if 4459 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4460 */ 4461 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4462 u32 exit_intr_info, unsigned long exit_qualification) 4463 { 4464 struct vcpu_vmx *vmx = to_vmx(vcpu); 4465 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4466 4467 /* trying to cancel vmlaunch/vmresume is a bug */ 4468 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4469 4470 /* Similarly, triple faults in L2 should never escape. */ 4471 WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); 4472 4473 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4474 /* 4475 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4476 * Enlightened VMCS after migration and we still need to 4477 * do that when something is forcing L2->L1 exit prior to 4478 * the first L2 run. 4479 */ 4480 (void)nested_get_evmcs_page(vcpu); 4481 } 4482 4483 /* Service the TLB flush request for L2 before switching to L1. */ 4484 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) 4485 kvm_vcpu_flush_tlb_current(vcpu); 4486 4487 /* 4488 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4489 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4490 * up-to-date before switching to L1. 4491 */ 4492 if (enable_ept && is_pae_paging(vcpu)) 4493 vmx_ept_load_pdptrs(vcpu); 4494 4495 leave_guest_mode(vcpu); 4496 4497 if (nested_cpu_has_preemption_timer(vmcs12)) 4498 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4499 4500 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4501 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4502 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4503 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4504 } 4505 4506 if (likely(!vmx->fail)) { 4507 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4508 4509 if (vm_exit_reason != -1) 4510 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4511 exit_intr_info, exit_qualification); 4512 4513 /* 4514 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4515 * also be used to capture vmcs12 cache as part of 4516 * capturing nVMX state for snapshot (migration). 4517 * 4518 * Otherwise, this flush will dirty guest memory at a 4519 * point it is already assumed by user-space to be 4520 * immutable. 4521 */ 4522 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4523 } else { 4524 /* 4525 * The only expected VM-instruction error is "VM entry with 4526 * invalid control field(s)." Anything else indicates a 4527 * problem with L0. And we should never get here with a 4528 * VMFail of any type if early consistency checks are enabled. 4529 */ 4530 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4531 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4532 WARN_ON_ONCE(nested_early_check); 4533 } 4534 4535 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4536 4537 /* Update any VMCS fields that might have changed while L2 ran */ 4538 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4539 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4540 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4541 if (kvm_has_tsc_control) 4542 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 4543 4544 if (vmx->nested.l1_tpr_threshold != -1) 4545 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4546 4547 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4548 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4549 vmx_set_virtual_apic_mode(vcpu); 4550 } 4551 4552 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4553 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4554 vmx_update_cpu_dirty_logging(vcpu); 4555 } 4556 4557 /* Unpin physical memory we referred to in vmcs02 */ 4558 if (vmx->nested.apic_access_page) { 4559 kvm_release_page_clean(vmx->nested.apic_access_page); 4560 vmx->nested.apic_access_page = NULL; 4561 } 4562 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4563 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4564 vmx->nested.pi_desc = NULL; 4565 4566 if (vmx->nested.reload_vmcs01_apic_access_page) { 4567 vmx->nested.reload_vmcs01_apic_access_page = false; 4568 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4569 } 4570 4571 if ((vm_exit_reason != -1) && 4572 (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) 4573 vmx->nested.need_vmcs12_to_shadow_sync = true; 4574 4575 /* in case we halted in L2 */ 4576 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4577 4578 if (likely(!vmx->fail)) { 4579 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4580 nested_exit_intr_ack_set(vcpu)) { 4581 int irq = kvm_cpu_get_interrupt(vcpu); 4582 WARN_ON(irq < 0); 4583 vmcs12->vm_exit_intr_info = irq | 4584 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4585 } 4586 4587 if (vm_exit_reason != -1) 4588 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4589 vmcs12->exit_qualification, 4590 vmcs12->idt_vectoring_info_field, 4591 vmcs12->vm_exit_intr_info, 4592 vmcs12->vm_exit_intr_error_code, 4593 KVM_ISA_VMX); 4594 4595 load_vmcs12_host_state(vcpu, vmcs12); 4596 4597 return; 4598 } 4599 4600 /* 4601 * After an early L2 VM-entry failure, we're now back 4602 * in L1 which thinks it just finished a VMLAUNCH or 4603 * VMRESUME instruction, so we need to set the failure 4604 * flag and the VM-instruction error field of the VMCS 4605 * accordingly, and skip the emulated instruction. 4606 */ 4607 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4608 4609 /* 4610 * Restore L1's host state to KVM's software model. We're here 4611 * because a consistency check was caught by hardware, which 4612 * means some amount of guest state has been propagated to KVM's 4613 * model and needs to be unwound to the host's state. 4614 */ 4615 nested_vmx_restore_host_state(vcpu); 4616 4617 vmx->fail = 0; 4618 } 4619 4620 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4621 { 4622 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4623 } 4624 4625 /* 4626 * Decode the memory-address operand of a vmx instruction, as recorded on an 4627 * exit caused by such an instruction (run by a guest hypervisor). 4628 * On success, returns 0. When the operand is invalid, returns 1 and throws 4629 * #UD, #GP, or #SS. 4630 */ 4631 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4632 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4633 { 4634 gva_t off; 4635 bool exn; 4636 struct kvm_segment s; 4637 4638 /* 4639 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4640 * Execution", on an exit, vmx_instruction_info holds most of the 4641 * addressing components of the operand. Only the displacement part 4642 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4643 * For how an actual address is calculated from all these components, 4644 * refer to Vol. 1, "Operand Addressing". 4645 */ 4646 int scaling = vmx_instruction_info & 3; 4647 int addr_size = (vmx_instruction_info >> 7) & 7; 4648 bool is_reg = vmx_instruction_info & (1u << 10); 4649 int seg_reg = (vmx_instruction_info >> 15) & 7; 4650 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4651 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4652 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4653 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4654 4655 if (is_reg) { 4656 kvm_queue_exception(vcpu, UD_VECTOR); 4657 return 1; 4658 } 4659 4660 /* Addr = segment_base + offset */ 4661 /* offset = base + [index * scale] + displacement */ 4662 off = exit_qualification; /* holds the displacement */ 4663 if (addr_size == 1) 4664 off = (gva_t)sign_extend64(off, 31); 4665 else if (addr_size == 0) 4666 off = (gva_t)sign_extend64(off, 15); 4667 if (base_is_valid) 4668 off += kvm_register_read(vcpu, base_reg); 4669 if (index_is_valid) 4670 off += kvm_register_read(vcpu, index_reg) << scaling; 4671 vmx_get_segment(vcpu, &s, seg_reg); 4672 4673 /* 4674 * The effective address, i.e. @off, of a memory operand is truncated 4675 * based on the address size of the instruction. Note that this is 4676 * the *effective address*, i.e. the address prior to accounting for 4677 * the segment's base. 4678 */ 4679 if (addr_size == 1) /* 32 bit */ 4680 off &= 0xffffffff; 4681 else if (addr_size == 0) /* 16 bit */ 4682 off &= 0xffff; 4683 4684 /* Checks for #GP/#SS exceptions. */ 4685 exn = false; 4686 if (is_long_mode(vcpu)) { 4687 /* 4688 * The virtual/linear address is never truncated in 64-bit 4689 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4690 * address when using FS/GS with a non-zero base. 4691 */ 4692 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4693 *ret = s.base + off; 4694 else 4695 *ret = off; 4696 4697 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4698 * non-canonical form. This is the only check on the memory 4699 * destination for long mode! 4700 */ 4701 exn = is_noncanonical_address(*ret, vcpu); 4702 } else { 4703 /* 4704 * When not in long mode, the virtual/linear address is 4705 * unconditionally truncated to 32 bits regardless of the 4706 * address size. 4707 */ 4708 *ret = (s.base + off) & 0xffffffff; 4709 4710 /* Protected mode: apply checks for segment validity in the 4711 * following order: 4712 * - segment type check (#GP(0) may be thrown) 4713 * - usability check (#GP(0)/#SS(0)) 4714 * - limit check (#GP(0)/#SS(0)) 4715 */ 4716 if (wr) 4717 /* #GP(0) if the destination operand is located in a 4718 * read-only data segment or any code segment. 4719 */ 4720 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4721 else 4722 /* #GP(0) if the source operand is located in an 4723 * execute-only code segment 4724 */ 4725 exn = ((s.type & 0xa) == 8); 4726 if (exn) { 4727 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4728 return 1; 4729 } 4730 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4731 */ 4732 exn = (s.unusable != 0); 4733 4734 /* 4735 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4736 * outside the segment limit. All CPUs that support VMX ignore 4737 * limit checks for flat segments, i.e. segments with base==0, 4738 * limit==0xffffffff and of type expand-up data or code. 4739 */ 4740 if (!(s.base == 0 && s.limit == 0xffffffff && 4741 ((s.type & 8) || !(s.type & 4)))) 4742 exn = exn || ((u64)off + len - 1 > s.limit); 4743 } 4744 if (exn) { 4745 kvm_queue_exception_e(vcpu, 4746 seg_reg == VCPU_SREG_SS ? 4747 SS_VECTOR : GP_VECTOR, 4748 0); 4749 return 1; 4750 } 4751 4752 return 0; 4753 } 4754 4755 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) 4756 { 4757 struct vcpu_vmx *vmx; 4758 4759 if (!nested_vmx_allowed(vcpu)) 4760 return; 4761 4762 vmx = to_vmx(vcpu); 4763 if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { 4764 vmx->nested.msrs.entry_ctls_high |= 4765 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4766 vmx->nested.msrs.exit_ctls_high |= 4767 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4768 } else { 4769 vmx->nested.msrs.entry_ctls_high &= 4770 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4771 vmx->nested.msrs.exit_ctls_high &= 4772 ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4773 } 4774 } 4775 4776 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 4777 int *ret) 4778 { 4779 gva_t gva; 4780 struct x86_exception e; 4781 int r; 4782 4783 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 4784 vmcs_read32(VMX_INSTRUCTION_INFO), false, 4785 sizeof(*vmpointer), &gva)) { 4786 *ret = 1; 4787 return -EINVAL; 4788 } 4789 4790 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 4791 if (r != X86EMUL_CONTINUE) { 4792 *ret = kvm_handle_memory_failure(vcpu, r, &e); 4793 return -EINVAL; 4794 } 4795 4796 return 0; 4797 } 4798 4799 /* 4800 * Allocate a shadow VMCS and associate it with the currently loaded 4801 * VMCS, unless such a shadow VMCS already exists. The newly allocated 4802 * VMCS is also VMCLEARed, so that it is ready for use. 4803 */ 4804 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 4805 { 4806 struct vcpu_vmx *vmx = to_vmx(vcpu); 4807 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 4808 4809 /* 4810 * We should allocate a shadow vmcs for vmcs01 only when L1 4811 * executes VMXON and free it when L1 executes VMXOFF. 4812 * As it is invalid to execute VMXON twice, we shouldn't reach 4813 * here when vmcs01 already have an allocated shadow vmcs. 4814 */ 4815 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); 4816 4817 if (!loaded_vmcs->shadow_vmcs) { 4818 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 4819 if (loaded_vmcs->shadow_vmcs) 4820 vmcs_clear(loaded_vmcs->shadow_vmcs); 4821 } 4822 return loaded_vmcs->shadow_vmcs; 4823 } 4824 4825 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 4826 { 4827 struct vcpu_vmx *vmx = to_vmx(vcpu); 4828 int r; 4829 4830 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 4831 if (r < 0) 4832 goto out_vmcs02; 4833 4834 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4835 if (!vmx->nested.cached_vmcs12) 4836 goto out_cached_vmcs12; 4837 4838 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4839 if (!vmx->nested.cached_shadow_vmcs12) 4840 goto out_cached_shadow_vmcs12; 4841 4842 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 4843 goto out_shadow_vmcs; 4844 4845 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 4846 HRTIMER_MODE_ABS_PINNED); 4847 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 4848 4849 vmx->nested.vpid02 = allocate_vpid(); 4850 4851 vmx->nested.vmcs02_initialized = false; 4852 vmx->nested.vmxon = true; 4853 4854 if (vmx_pt_mode_is_host_guest()) { 4855 vmx->pt_desc.guest.ctl = 0; 4856 pt_update_intercept_for_msr(vcpu); 4857 } 4858 4859 return 0; 4860 4861 out_shadow_vmcs: 4862 kfree(vmx->nested.cached_shadow_vmcs12); 4863 4864 out_cached_shadow_vmcs12: 4865 kfree(vmx->nested.cached_vmcs12); 4866 4867 out_cached_vmcs12: 4868 free_loaded_vmcs(&vmx->nested.vmcs02); 4869 4870 out_vmcs02: 4871 return -ENOMEM; 4872 } 4873 4874 /* Emulate the VMXON instruction. */ 4875 static int handle_vmon(struct kvm_vcpu *vcpu) 4876 { 4877 int ret; 4878 gpa_t vmptr; 4879 uint32_t revision; 4880 struct vcpu_vmx *vmx = to_vmx(vcpu); 4881 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 4882 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 4883 4884 /* 4885 * The Intel VMX Instruction Reference lists a bunch of bits that are 4886 * prerequisite to running VMXON, most notably cr4.VMXE must be set to 4887 * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this). 4888 * Otherwise, we should fail with #UD. But most faulting conditions 4889 * have already been checked by hardware, prior to the VM-exit for 4890 * VMXON. We do test guest cr4.VMXE because processor CR4 always has 4891 * that bit set to 1 in non-root mode. 4892 */ 4893 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { 4894 kvm_queue_exception(vcpu, UD_VECTOR); 4895 return 1; 4896 } 4897 4898 /* CPL=0 must be checked manually. */ 4899 if (vmx_get_cpl(vcpu)) { 4900 kvm_inject_gp(vcpu, 0); 4901 return 1; 4902 } 4903 4904 if (vmx->nested.vmxon) 4905 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 4906 4907 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 4908 != VMXON_NEEDED_FEATURES) { 4909 kvm_inject_gp(vcpu, 0); 4910 return 1; 4911 } 4912 4913 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 4914 return ret; 4915 4916 /* 4917 * SDM 3: 24.11.5 4918 * The first 4 bytes of VMXON region contain the supported 4919 * VMCS revision identifier 4920 * 4921 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 4922 * which replaces physical address width with 32 4923 */ 4924 if (!page_address_valid(vcpu, vmptr)) 4925 return nested_vmx_failInvalid(vcpu); 4926 4927 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 4928 revision != VMCS12_REVISION) 4929 return nested_vmx_failInvalid(vcpu); 4930 4931 vmx->nested.vmxon_ptr = vmptr; 4932 ret = enter_vmx_operation(vcpu); 4933 if (ret) 4934 return ret; 4935 4936 return nested_vmx_succeed(vcpu); 4937 } 4938 4939 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 4940 { 4941 struct vcpu_vmx *vmx = to_vmx(vcpu); 4942 4943 if (vmx->nested.current_vmptr == INVALID_GPA) 4944 return; 4945 4946 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 4947 4948 if (enable_shadow_vmcs) { 4949 /* copy to memory all shadowed fields in case 4950 they were modified */ 4951 copy_shadow_to_vmcs12(vmx); 4952 vmx_disable_shadow_vmcs(vmx); 4953 } 4954 vmx->nested.posted_intr_nv = -1; 4955 4956 /* Flush VMCS12 to guest memory */ 4957 kvm_vcpu_write_guest_page(vcpu, 4958 vmx->nested.current_vmptr >> PAGE_SHIFT, 4959 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 4960 4961 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 4962 4963 vmx->nested.current_vmptr = INVALID_GPA; 4964 } 4965 4966 /* Emulate the VMXOFF instruction */ 4967 static int handle_vmoff(struct kvm_vcpu *vcpu) 4968 { 4969 if (!nested_vmx_check_permission(vcpu)) 4970 return 1; 4971 4972 free_nested(vcpu); 4973 4974 /* Process a latched INIT during time CPU was in VMX operation */ 4975 kvm_make_request(KVM_REQ_EVENT, vcpu); 4976 4977 return nested_vmx_succeed(vcpu); 4978 } 4979 4980 /* Emulate the VMCLEAR instruction */ 4981 static int handle_vmclear(struct kvm_vcpu *vcpu) 4982 { 4983 struct vcpu_vmx *vmx = to_vmx(vcpu); 4984 u32 zero = 0; 4985 gpa_t vmptr; 4986 u64 evmcs_gpa; 4987 int r; 4988 4989 if (!nested_vmx_check_permission(vcpu)) 4990 return 1; 4991 4992 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 4993 return r; 4994 4995 if (!page_address_valid(vcpu, vmptr)) 4996 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 4997 4998 if (vmptr == vmx->nested.vmxon_ptr) 4999 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5000 5001 /* 5002 * When Enlightened VMEntry is enabled on the calling CPU we treat 5003 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 5004 * way to distinguish it from VMCS12) and we must not corrupt it by 5005 * writing to the non-existent 'launch_state' field. The area doesn't 5006 * have to be the currently active EVMCS on the calling CPU and there's 5007 * nothing KVM has to do to transition it from 'active' to 'non-active' 5008 * state. It is possible that the area will stay mapped as 5009 * vmx->nested.hv_evmcs but this shouldn't be a problem. 5010 */ 5011 if (likely(!vmx->nested.enlightened_vmcs_enabled || 5012 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 5013 if (vmptr == vmx->nested.current_vmptr) 5014 nested_release_vmcs12(vcpu); 5015 5016 kvm_vcpu_write_guest(vcpu, 5017 vmptr + offsetof(struct vmcs12, 5018 launch_state), 5019 &zero, sizeof(zero)); 5020 } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { 5021 nested_release_evmcs(vcpu); 5022 } 5023 5024 return nested_vmx_succeed(vcpu); 5025 } 5026 5027 /* Emulate the VMLAUNCH instruction */ 5028 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5029 { 5030 return nested_vmx_run(vcpu, true); 5031 } 5032 5033 /* Emulate the VMRESUME instruction */ 5034 static int handle_vmresume(struct kvm_vcpu *vcpu) 5035 { 5036 5037 return nested_vmx_run(vcpu, false); 5038 } 5039 5040 static int handle_vmread(struct kvm_vcpu *vcpu) 5041 { 5042 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5043 : get_vmcs12(vcpu); 5044 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5045 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5046 struct vcpu_vmx *vmx = to_vmx(vcpu); 5047 struct x86_exception e; 5048 unsigned long field; 5049 u64 value; 5050 gva_t gva = 0; 5051 short offset; 5052 int len, r; 5053 5054 if (!nested_vmx_check_permission(vcpu)) 5055 return 1; 5056 5057 /* 5058 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5059 * any VMREAD sets the ALU flags for VMfailInvalid. 5060 */ 5061 if (vmx->nested.current_vmptr == INVALID_GPA || 5062 (is_guest_mode(vcpu) && 5063 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5064 return nested_vmx_failInvalid(vcpu); 5065 5066 /* Decode instruction info and find the field to read */ 5067 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5068 5069 offset = vmcs_field_to_offset(field); 5070 if (offset < 0) 5071 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5072 5073 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5074 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5075 5076 /* Read the field, zero-extended to a u64 value */ 5077 value = vmcs12_read_any(vmcs12, field, offset); 5078 5079 /* 5080 * Now copy part of this value to register or memory, as requested. 5081 * Note that the number of bits actually copied is 32 or 64 depending 5082 * on the guest's mode (32 or 64 bit), not on the given field's length. 5083 */ 5084 if (instr_info & BIT(10)) { 5085 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5086 } else { 5087 len = is_64_bit_mode(vcpu) ? 8 : 4; 5088 if (get_vmx_mem_address(vcpu, exit_qualification, 5089 instr_info, true, len, &gva)) 5090 return 1; 5091 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5092 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5093 if (r != X86EMUL_CONTINUE) 5094 return kvm_handle_memory_failure(vcpu, r, &e); 5095 } 5096 5097 return nested_vmx_succeed(vcpu); 5098 } 5099 5100 static bool is_shadow_field_rw(unsigned long field) 5101 { 5102 switch (field) { 5103 #define SHADOW_FIELD_RW(x, y) case x: 5104 #include "vmcs_shadow_fields.h" 5105 return true; 5106 default: 5107 break; 5108 } 5109 return false; 5110 } 5111 5112 static bool is_shadow_field_ro(unsigned long field) 5113 { 5114 switch (field) { 5115 #define SHADOW_FIELD_RO(x, y) case x: 5116 #include "vmcs_shadow_fields.h" 5117 return true; 5118 default: 5119 break; 5120 } 5121 return false; 5122 } 5123 5124 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5125 { 5126 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5127 : get_vmcs12(vcpu); 5128 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5129 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5130 struct vcpu_vmx *vmx = to_vmx(vcpu); 5131 struct x86_exception e; 5132 unsigned long field; 5133 short offset; 5134 gva_t gva; 5135 int len, r; 5136 5137 /* 5138 * The value to write might be 32 or 64 bits, depending on L1's long 5139 * mode, and eventually we need to write that into a field of several 5140 * possible lengths. The code below first zero-extends the value to 64 5141 * bit (value), and then copies only the appropriate number of 5142 * bits into the vmcs12 field. 5143 */ 5144 u64 value = 0; 5145 5146 if (!nested_vmx_check_permission(vcpu)) 5147 return 1; 5148 5149 /* 5150 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5151 * any VMWRITE sets the ALU flags for VMfailInvalid. 5152 */ 5153 if (vmx->nested.current_vmptr == INVALID_GPA || 5154 (is_guest_mode(vcpu) && 5155 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5156 return nested_vmx_failInvalid(vcpu); 5157 5158 if (instr_info & BIT(10)) 5159 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5160 else { 5161 len = is_64_bit_mode(vcpu) ? 8 : 4; 5162 if (get_vmx_mem_address(vcpu, exit_qualification, 5163 instr_info, false, len, &gva)) 5164 return 1; 5165 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5166 if (r != X86EMUL_CONTINUE) 5167 return kvm_handle_memory_failure(vcpu, r, &e); 5168 } 5169 5170 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5171 5172 offset = vmcs_field_to_offset(field); 5173 if (offset < 0) 5174 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5175 5176 /* 5177 * If the vCPU supports "VMWRITE to any supported field in the 5178 * VMCS," then the "read-only" fields are actually read/write. 5179 */ 5180 if (vmcs_field_readonly(field) && 5181 !nested_cpu_has_vmwrite_any_field(vcpu)) 5182 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5183 5184 /* 5185 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5186 * vmcs12, else we may crush a field or consume a stale value. 5187 */ 5188 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5189 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5190 5191 /* 5192 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5193 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5194 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5195 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5196 * from L1 will return a different value than VMREAD from L2 (L1 sees 5197 * the stripped down value, L2 sees the full value as stored by KVM). 5198 */ 5199 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5200 value &= 0x1f0ff; 5201 5202 vmcs12_write_any(vmcs12, field, offset, value); 5203 5204 /* 5205 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5206 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5207 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5208 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5209 */ 5210 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5211 /* 5212 * L1 can read these fields without exiting, ensure the 5213 * shadow VMCS is up-to-date. 5214 */ 5215 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5216 preempt_disable(); 5217 vmcs_load(vmx->vmcs01.shadow_vmcs); 5218 5219 __vmcs_writel(field, value); 5220 5221 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5222 vmcs_load(vmx->loaded_vmcs->vmcs); 5223 preempt_enable(); 5224 } 5225 vmx->nested.dirty_vmcs12 = true; 5226 } 5227 5228 return nested_vmx_succeed(vcpu); 5229 } 5230 5231 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5232 { 5233 vmx->nested.current_vmptr = vmptr; 5234 if (enable_shadow_vmcs) { 5235 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5236 vmcs_write64(VMCS_LINK_POINTER, 5237 __pa(vmx->vmcs01.shadow_vmcs)); 5238 vmx->nested.need_vmcs12_to_shadow_sync = true; 5239 } 5240 vmx->nested.dirty_vmcs12 = true; 5241 } 5242 5243 /* Emulate the VMPTRLD instruction */ 5244 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5245 { 5246 struct vcpu_vmx *vmx = to_vmx(vcpu); 5247 gpa_t vmptr; 5248 int r; 5249 5250 if (!nested_vmx_check_permission(vcpu)) 5251 return 1; 5252 5253 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5254 return r; 5255 5256 if (!page_address_valid(vcpu, vmptr)) 5257 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5258 5259 if (vmptr == vmx->nested.vmxon_ptr) 5260 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5261 5262 /* Forbid normal VMPTRLD if Enlightened version was used */ 5263 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 5264 return 1; 5265 5266 if (vmx->nested.current_vmptr != vmptr) { 5267 struct kvm_host_map map; 5268 struct vmcs12 *new_vmcs12; 5269 5270 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { 5271 /* 5272 * Reads from an unbacked page return all 1s, 5273 * which means that the 32 bits located at the 5274 * given physical address won't match the required 5275 * VMCS12_REVISION identifier. 5276 */ 5277 return nested_vmx_fail(vcpu, 5278 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5279 } 5280 5281 new_vmcs12 = map.hva; 5282 5283 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || 5284 (new_vmcs12->hdr.shadow_vmcs && 5285 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5286 kvm_vcpu_unmap(vcpu, &map, false); 5287 return nested_vmx_fail(vcpu, 5288 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5289 } 5290 5291 nested_release_vmcs12(vcpu); 5292 5293 /* 5294 * Load VMCS12 from guest memory since it is not already 5295 * cached. 5296 */ 5297 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); 5298 kvm_vcpu_unmap(vcpu, &map, false); 5299 5300 set_current_vmptr(vmx, vmptr); 5301 } 5302 5303 return nested_vmx_succeed(vcpu); 5304 } 5305 5306 /* Emulate the VMPTRST instruction */ 5307 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5308 { 5309 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5310 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5311 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5312 struct x86_exception e; 5313 gva_t gva; 5314 int r; 5315 5316 if (!nested_vmx_check_permission(vcpu)) 5317 return 1; 5318 5319 if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) 5320 return 1; 5321 5322 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5323 true, sizeof(gpa_t), &gva)) 5324 return 1; 5325 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5326 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5327 sizeof(gpa_t), &e); 5328 if (r != X86EMUL_CONTINUE) 5329 return kvm_handle_memory_failure(vcpu, r, &e); 5330 5331 return nested_vmx_succeed(vcpu); 5332 } 5333 5334 /* Emulate the INVEPT instruction */ 5335 static int handle_invept(struct kvm_vcpu *vcpu) 5336 { 5337 struct vcpu_vmx *vmx = to_vmx(vcpu); 5338 u32 vmx_instruction_info, types; 5339 unsigned long type, roots_to_free; 5340 struct kvm_mmu *mmu; 5341 gva_t gva; 5342 struct x86_exception e; 5343 struct { 5344 u64 eptp, gpa; 5345 } operand; 5346 int i, r, gpr_index; 5347 5348 if (!(vmx->nested.msrs.secondary_ctls_high & 5349 SECONDARY_EXEC_ENABLE_EPT) || 5350 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5351 kvm_queue_exception(vcpu, UD_VECTOR); 5352 return 1; 5353 } 5354 5355 if (!nested_vmx_check_permission(vcpu)) 5356 return 1; 5357 5358 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5359 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5360 type = kvm_register_read(vcpu, gpr_index); 5361 5362 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5363 5364 if (type >= 32 || !(types & (1 << type))) 5365 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5366 5367 /* According to the Intel VMX instruction reference, the memory 5368 * operand is read even if it isn't needed (e.g., for type==global) 5369 */ 5370 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5371 vmx_instruction_info, false, sizeof(operand), &gva)) 5372 return 1; 5373 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5374 if (r != X86EMUL_CONTINUE) 5375 return kvm_handle_memory_failure(vcpu, r, &e); 5376 5377 /* 5378 * Nested EPT roots are always held through guest_mmu, 5379 * not root_mmu. 5380 */ 5381 mmu = &vcpu->arch.guest_mmu; 5382 5383 switch (type) { 5384 case VMX_EPT_EXTENT_CONTEXT: 5385 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5386 return nested_vmx_fail(vcpu, 5387 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5388 5389 roots_to_free = 0; 5390 if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd, 5391 operand.eptp)) 5392 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5393 5394 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5395 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5396 mmu->prev_roots[i].pgd, 5397 operand.eptp)) 5398 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5399 } 5400 break; 5401 case VMX_EPT_EXTENT_GLOBAL: 5402 roots_to_free = KVM_MMU_ROOTS_ALL; 5403 break; 5404 default: 5405 BUG(); 5406 break; 5407 } 5408 5409 if (roots_to_free) 5410 kvm_mmu_free_roots(vcpu, mmu, roots_to_free); 5411 5412 return nested_vmx_succeed(vcpu); 5413 } 5414 5415 static int handle_invvpid(struct kvm_vcpu *vcpu) 5416 { 5417 struct vcpu_vmx *vmx = to_vmx(vcpu); 5418 u32 vmx_instruction_info; 5419 unsigned long type, types; 5420 gva_t gva; 5421 struct x86_exception e; 5422 struct { 5423 u64 vpid; 5424 u64 gla; 5425 } operand; 5426 u16 vpid02; 5427 int r, gpr_index; 5428 5429 if (!(vmx->nested.msrs.secondary_ctls_high & 5430 SECONDARY_EXEC_ENABLE_VPID) || 5431 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5432 kvm_queue_exception(vcpu, UD_VECTOR); 5433 return 1; 5434 } 5435 5436 if (!nested_vmx_check_permission(vcpu)) 5437 return 1; 5438 5439 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5440 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5441 type = kvm_register_read(vcpu, gpr_index); 5442 5443 types = (vmx->nested.msrs.vpid_caps & 5444 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5445 5446 if (type >= 32 || !(types & (1 << type))) 5447 return nested_vmx_fail(vcpu, 5448 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5449 5450 /* according to the intel vmx instruction reference, the memory 5451 * operand is read even if it isn't needed (e.g., for type==global) 5452 */ 5453 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5454 vmx_instruction_info, false, sizeof(operand), &gva)) 5455 return 1; 5456 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5457 if (r != X86EMUL_CONTINUE) 5458 return kvm_handle_memory_failure(vcpu, r, &e); 5459 5460 if (operand.vpid >> 16) 5461 return nested_vmx_fail(vcpu, 5462 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5463 5464 vpid02 = nested_get_vpid02(vcpu); 5465 switch (type) { 5466 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5467 if (!operand.vpid || 5468 is_noncanonical_address(operand.gla, vcpu)) 5469 return nested_vmx_fail(vcpu, 5470 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5471 vpid_sync_vcpu_addr(vpid02, operand.gla); 5472 break; 5473 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5474 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5475 if (!operand.vpid) 5476 return nested_vmx_fail(vcpu, 5477 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5478 vpid_sync_context(vpid02); 5479 break; 5480 case VMX_VPID_EXTENT_ALL_CONTEXT: 5481 vpid_sync_context(vpid02); 5482 break; 5483 default: 5484 WARN_ON_ONCE(1); 5485 return kvm_skip_emulated_instruction(vcpu); 5486 } 5487 5488 /* 5489 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5490 * linear mappings for L2 (tagged with L2's VPID). Free all guest 5491 * roots as VPIDs are not tracked in the MMU role. 5492 * 5493 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5494 * an MMU when EPT is disabled. 5495 * 5496 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 5497 */ 5498 if (!enable_ept) 5499 kvm_mmu_free_guest_mode_roots(vcpu, &vcpu->arch.root_mmu); 5500 5501 return nested_vmx_succeed(vcpu); 5502 } 5503 5504 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5505 struct vmcs12 *vmcs12) 5506 { 5507 u32 index = kvm_rcx_read(vcpu); 5508 u64 new_eptp; 5509 5510 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 5511 return 1; 5512 if (index >= VMFUNC_EPTP_ENTRIES) 5513 return 1; 5514 5515 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5516 &new_eptp, index * 8, 8)) 5517 return 1; 5518 5519 /* 5520 * If the (L2) guest does a vmfunc to the currently 5521 * active ept pointer, we don't have to do anything else 5522 */ 5523 if (vmcs12->ept_pointer != new_eptp) { 5524 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5525 return 1; 5526 5527 vmcs12->ept_pointer = new_eptp; 5528 nested_ept_new_eptp(vcpu); 5529 5530 if (!nested_cpu_has_vpid(vmcs12)) 5531 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 5532 } 5533 5534 return 0; 5535 } 5536 5537 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5538 { 5539 struct vcpu_vmx *vmx = to_vmx(vcpu); 5540 struct vmcs12 *vmcs12; 5541 u32 function = kvm_rax_read(vcpu); 5542 5543 /* 5544 * VMFUNC is only supported for nested guests, but we always enable the 5545 * secondary control for simplicity; for non-nested mode, fake that we 5546 * didn't by injecting #UD. 5547 */ 5548 if (!is_guest_mode(vcpu)) { 5549 kvm_queue_exception(vcpu, UD_VECTOR); 5550 return 1; 5551 } 5552 5553 vmcs12 = get_vmcs12(vcpu); 5554 5555 /* 5556 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 5557 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 5558 */ 5559 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 5560 kvm_queue_exception(vcpu, UD_VECTOR); 5561 return 1; 5562 } 5563 5564 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 5565 goto fail; 5566 5567 switch (function) { 5568 case 0: 5569 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5570 goto fail; 5571 break; 5572 default: 5573 goto fail; 5574 } 5575 return kvm_skip_emulated_instruction(vcpu); 5576 5577 fail: 5578 /* 5579 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5580 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5581 * EXIT_REASON_VMFUNC as the exit reason. 5582 */ 5583 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5584 vmx_get_intr_info(vcpu), 5585 vmx_get_exit_qual(vcpu)); 5586 return 1; 5587 } 5588 5589 /* 5590 * Return true if an IO instruction with the specified port and size should cause 5591 * a VM-exit into L1. 5592 */ 5593 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5594 int size) 5595 { 5596 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5597 gpa_t bitmap, last_bitmap; 5598 u8 b; 5599 5600 last_bitmap = INVALID_GPA; 5601 b = -1; 5602 5603 while (size > 0) { 5604 if (port < 0x8000) 5605 bitmap = vmcs12->io_bitmap_a; 5606 else if (port < 0x10000) 5607 bitmap = vmcs12->io_bitmap_b; 5608 else 5609 return true; 5610 bitmap += (port & 0x7fff) / 8; 5611 5612 if (last_bitmap != bitmap) 5613 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5614 return true; 5615 if (b & (1 << (port & 7))) 5616 return true; 5617 5618 port++; 5619 size--; 5620 last_bitmap = bitmap; 5621 } 5622 5623 return false; 5624 } 5625 5626 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5627 struct vmcs12 *vmcs12) 5628 { 5629 unsigned long exit_qualification; 5630 unsigned short port; 5631 int size; 5632 5633 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5634 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 5635 5636 exit_qualification = vmx_get_exit_qual(vcpu); 5637 5638 port = exit_qualification >> 16; 5639 size = (exit_qualification & 7) + 1; 5640 5641 return nested_vmx_check_io_bitmaps(vcpu, port, size); 5642 } 5643 5644 /* 5645 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 5646 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 5647 * disinterest in the current event (read or write a specific MSR) by using an 5648 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 5649 */ 5650 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 5651 struct vmcs12 *vmcs12, 5652 union vmx_exit_reason exit_reason) 5653 { 5654 u32 msr_index = kvm_rcx_read(vcpu); 5655 gpa_t bitmap; 5656 5657 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 5658 return true; 5659 5660 /* 5661 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 5662 * for the four combinations of read/write and low/high MSR numbers. 5663 * First we need to figure out which of the four to use: 5664 */ 5665 bitmap = vmcs12->msr_bitmap; 5666 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 5667 bitmap += 2048; 5668 if (msr_index >= 0xc0000000) { 5669 msr_index -= 0xc0000000; 5670 bitmap += 1024; 5671 } 5672 5673 /* Then read the msr_index'th bit from this bitmap: */ 5674 if (msr_index < 1024*8) { 5675 unsigned char b; 5676 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 5677 return true; 5678 return 1 & (b >> (msr_index & 7)); 5679 } else 5680 return true; /* let L1 handle the wrong parameter */ 5681 } 5682 5683 /* 5684 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 5685 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 5686 * intercept (via guest_host_mask etc.) the current event. 5687 */ 5688 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 5689 struct vmcs12 *vmcs12) 5690 { 5691 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5692 int cr = exit_qualification & 15; 5693 int reg; 5694 unsigned long val; 5695 5696 switch ((exit_qualification >> 4) & 3) { 5697 case 0: /* mov to cr */ 5698 reg = (exit_qualification >> 8) & 15; 5699 val = kvm_register_read(vcpu, reg); 5700 switch (cr) { 5701 case 0: 5702 if (vmcs12->cr0_guest_host_mask & 5703 (val ^ vmcs12->cr0_read_shadow)) 5704 return true; 5705 break; 5706 case 3: 5707 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 5708 return true; 5709 break; 5710 case 4: 5711 if (vmcs12->cr4_guest_host_mask & 5712 (vmcs12->cr4_read_shadow ^ val)) 5713 return true; 5714 break; 5715 case 8: 5716 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 5717 return true; 5718 break; 5719 } 5720 break; 5721 case 2: /* clts */ 5722 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 5723 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 5724 return true; 5725 break; 5726 case 1: /* mov from cr */ 5727 switch (cr) { 5728 case 3: 5729 if (vmcs12->cpu_based_vm_exec_control & 5730 CPU_BASED_CR3_STORE_EXITING) 5731 return true; 5732 break; 5733 case 8: 5734 if (vmcs12->cpu_based_vm_exec_control & 5735 CPU_BASED_CR8_STORE_EXITING) 5736 return true; 5737 break; 5738 } 5739 break; 5740 case 3: /* lmsw */ 5741 /* 5742 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 5743 * cr0. Other attempted changes are ignored, with no exit. 5744 */ 5745 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5746 if (vmcs12->cr0_guest_host_mask & 0xe & 5747 (val ^ vmcs12->cr0_read_shadow)) 5748 return true; 5749 if ((vmcs12->cr0_guest_host_mask & 0x1) && 5750 !(vmcs12->cr0_read_shadow & 0x1) && 5751 (val & 0x1)) 5752 return true; 5753 break; 5754 } 5755 return false; 5756 } 5757 5758 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 5759 struct vmcs12 *vmcs12) 5760 { 5761 u32 encls_leaf; 5762 5763 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 5764 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 5765 return false; 5766 5767 encls_leaf = kvm_rax_read(vcpu); 5768 if (encls_leaf > 62) 5769 encls_leaf = 63; 5770 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 5771 } 5772 5773 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 5774 struct vmcs12 *vmcs12, gpa_t bitmap) 5775 { 5776 u32 vmx_instruction_info; 5777 unsigned long field; 5778 u8 b; 5779 5780 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 5781 return true; 5782 5783 /* Decode instruction info and find the field to access */ 5784 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5785 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 5786 5787 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 5788 if (field >> 15) 5789 return true; 5790 5791 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 5792 return true; 5793 5794 return 1 & (b >> (field & 7)); 5795 } 5796 5797 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 5798 { 5799 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 5800 5801 if (nested_cpu_has_mtf(vmcs12)) 5802 return true; 5803 5804 /* 5805 * An MTF VM-exit may be injected into the guest by setting the 5806 * interruption-type to 7 (other event) and the vector field to 0. Such 5807 * is the case regardless of the 'monitor trap flag' VM-execution 5808 * control. 5809 */ 5810 return entry_intr_info == (INTR_INFO_VALID_MASK 5811 | INTR_TYPE_OTHER_EVENT); 5812 } 5813 5814 /* 5815 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 5816 * L1 wants the exit. Only call this when in is_guest_mode (L2). 5817 */ 5818 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 5819 union vmx_exit_reason exit_reason) 5820 { 5821 u32 intr_info; 5822 5823 switch ((u16)exit_reason.basic) { 5824 case EXIT_REASON_EXCEPTION_NMI: 5825 intr_info = vmx_get_intr_info(vcpu); 5826 if (is_nmi(intr_info)) 5827 return true; 5828 else if (is_page_fault(intr_info)) 5829 return vcpu->arch.apf.host_apf_flags || 5830 vmx_need_pf_intercept(vcpu); 5831 else if (is_debug(intr_info) && 5832 vcpu->guest_debug & 5833 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5834 return true; 5835 else if (is_breakpoint(intr_info) && 5836 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5837 return true; 5838 else if (is_alignment_check(intr_info) && 5839 !vmx_guest_inject_ac(vcpu)) 5840 return true; 5841 return false; 5842 case EXIT_REASON_EXTERNAL_INTERRUPT: 5843 return true; 5844 case EXIT_REASON_MCE_DURING_VMENTRY: 5845 return true; 5846 case EXIT_REASON_EPT_VIOLATION: 5847 /* 5848 * L0 always deals with the EPT violation. If nested EPT is 5849 * used, and the nested mmu code discovers that the address is 5850 * missing in the guest EPT table (EPT12), the EPT violation 5851 * will be injected with nested_ept_inject_page_fault() 5852 */ 5853 return true; 5854 case EXIT_REASON_EPT_MISCONFIG: 5855 /* 5856 * L2 never uses directly L1's EPT, but rather L0's own EPT 5857 * table (shadow on EPT) or a merged EPT table that L0 built 5858 * (EPT on EPT). So any problems with the structure of the 5859 * table is L0's fault. 5860 */ 5861 return true; 5862 case EXIT_REASON_PREEMPTION_TIMER: 5863 return true; 5864 case EXIT_REASON_PML_FULL: 5865 /* 5866 * PML is emulated for an L1 VMM and should never be enabled in 5867 * vmcs02, always "handle" PML_FULL by exiting to userspace. 5868 */ 5869 return true; 5870 case EXIT_REASON_VMFUNC: 5871 /* VM functions are emulated through L2->L0 vmexits. */ 5872 return true; 5873 case EXIT_REASON_BUS_LOCK: 5874 /* 5875 * At present, bus lock VM exit is never exposed to L1. 5876 * Handle L2's bus locks in L0 directly. 5877 */ 5878 return true; 5879 default: 5880 break; 5881 } 5882 return false; 5883 } 5884 5885 /* 5886 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 5887 * is_guest_mode (L2). 5888 */ 5889 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 5890 union vmx_exit_reason exit_reason) 5891 { 5892 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5893 u32 intr_info; 5894 5895 switch ((u16)exit_reason.basic) { 5896 case EXIT_REASON_EXCEPTION_NMI: 5897 intr_info = vmx_get_intr_info(vcpu); 5898 if (is_nmi(intr_info)) 5899 return true; 5900 else if (is_page_fault(intr_info)) 5901 return true; 5902 return vmcs12->exception_bitmap & 5903 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 5904 case EXIT_REASON_EXTERNAL_INTERRUPT: 5905 return nested_exit_on_intr(vcpu); 5906 case EXIT_REASON_TRIPLE_FAULT: 5907 return true; 5908 case EXIT_REASON_INTERRUPT_WINDOW: 5909 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 5910 case EXIT_REASON_NMI_WINDOW: 5911 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 5912 case EXIT_REASON_TASK_SWITCH: 5913 return true; 5914 case EXIT_REASON_CPUID: 5915 return true; 5916 case EXIT_REASON_HLT: 5917 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 5918 case EXIT_REASON_INVD: 5919 return true; 5920 case EXIT_REASON_INVLPG: 5921 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5922 case EXIT_REASON_RDPMC: 5923 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 5924 case EXIT_REASON_RDRAND: 5925 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 5926 case EXIT_REASON_RDSEED: 5927 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 5928 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 5929 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 5930 case EXIT_REASON_VMREAD: 5931 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5932 vmcs12->vmread_bitmap); 5933 case EXIT_REASON_VMWRITE: 5934 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5935 vmcs12->vmwrite_bitmap); 5936 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 5937 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 5938 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 5939 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 5940 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 5941 /* 5942 * VMX instructions trap unconditionally. This allows L1 to 5943 * emulate them for its L2 guest, i.e., allows 3-level nesting! 5944 */ 5945 return true; 5946 case EXIT_REASON_CR_ACCESS: 5947 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 5948 case EXIT_REASON_DR_ACCESS: 5949 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 5950 case EXIT_REASON_IO_INSTRUCTION: 5951 return nested_vmx_exit_handled_io(vcpu, vmcs12); 5952 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 5953 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 5954 case EXIT_REASON_MSR_READ: 5955 case EXIT_REASON_MSR_WRITE: 5956 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 5957 case EXIT_REASON_INVALID_STATE: 5958 return true; 5959 case EXIT_REASON_MWAIT_INSTRUCTION: 5960 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 5961 case EXIT_REASON_MONITOR_TRAP_FLAG: 5962 return nested_vmx_exit_handled_mtf(vmcs12); 5963 case EXIT_REASON_MONITOR_INSTRUCTION: 5964 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 5965 case EXIT_REASON_PAUSE_INSTRUCTION: 5966 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 5967 nested_cpu_has2(vmcs12, 5968 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 5969 case EXIT_REASON_MCE_DURING_VMENTRY: 5970 return true; 5971 case EXIT_REASON_TPR_BELOW_THRESHOLD: 5972 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 5973 case EXIT_REASON_APIC_ACCESS: 5974 case EXIT_REASON_APIC_WRITE: 5975 case EXIT_REASON_EOI_INDUCED: 5976 /* 5977 * The controls for "virtualize APIC accesses," "APIC- 5978 * register virtualization," and "virtual-interrupt 5979 * delivery" only come from vmcs12. 5980 */ 5981 return true; 5982 case EXIT_REASON_INVPCID: 5983 return 5984 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 5985 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5986 case EXIT_REASON_WBINVD: 5987 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 5988 case EXIT_REASON_XSETBV: 5989 return true; 5990 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 5991 /* 5992 * This should never happen, since it is not possible to 5993 * set XSS to a non-zero value---neither in L1 nor in L2. 5994 * If if it were, XSS would have to be checked against 5995 * the XSS exit bitmap in vmcs12. 5996 */ 5997 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 5998 case EXIT_REASON_UMWAIT: 5999 case EXIT_REASON_TPAUSE: 6000 return nested_cpu_has2(vmcs12, 6001 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6002 case EXIT_REASON_ENCLS: 6003 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6004 default: 6005 return true; 6006 } 6007 } 6008 6009 /* 6010 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6011 * reflected into L1. 6012 */ 6013 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6014 { 6015 struct vcpu_vmx *vmx = to_vmx(vcpu); 6016 union vmx_exit_reason exit_reason = vmx->exit_reason; 6017 unsigned long exit_qual; 6018 u32 exit_intr_info; 6019 6020 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6021 6022 /* 6023 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6024 * has already loaded L2's state. 6025 */ 6026 if (unlikely(vmx->fail)) { 6027 trace_kvm_nested_vmenter_failed( 6028 "hardware VM-instruction error: ", 6029 vmcs_read32(VM_INSTRUCTION_ERROR)); 6030 exit_intr_info = 0; 6031 exit_qual = 0; 6032 goto reflect_vmexit; 6033 } 6034 6035 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6036 6037 /* If L0 (KVM) wants the exit, it trumps L1's desires. */ 6038 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6039 return false; 6040 6041 /* If L1 doesn't want the exit, handle it in L0. */ 6042 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6043 return false; 6044 6045 /* 6046 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6047 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6048 * need to be synthesized by querying the in-kernel LAPIC, but external 6049 * interrupts are never reflected to L1 so it's a non-issue. 6050 */ 6051 exit_intr_info = vmx_get_intr_info(vcpu); 6052 if (is_exception_with_error_code(exit_intr_info)) { 6053 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6054 6055 vmcs12->vm_exit_intr_error_code = 6056 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6057 } 6058 exit_qual = vmx_get_exit_qual(vcpu); 6059 6060 reflect_vmexit: 6061 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6062 return true; 6063 } 6064 6065 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6066 struct kvm_nested_state __user *user_kvm_nested_state, 6067 u32 user_data_size) 6068 { 6069 struct vcpu_vmx *vmx; 6070 struct vmcs12 *vmcs12; 6071 struct kvm_nested_state kvm_state = { 6072 .flags = 0, 6073 .format = KVM_STATE_NESTED_FORMAT_VMX, 6074 .size = sizeof(kvm_state), 6075 .hdr.vmx.flags = 0, 6076 .hdr.vmx.vmxon_pa = INVALID_GPA, 6077 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6078 .hdr.vmx.preemption_timer_deadline = 0, 6079 }; 6080 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6081 &user_kvm_nested_state->data.vmx[0]; 6082 6083 if (!vcpu) 6084 return kvm_state.size + sizeof(*user_vmx_nested_state); 6085 6086 vmx = to_vmx(vcpu); 6087 vmcs12 = get_vmcs12(vcpu); 6088 6089 if (nested_vmx_allowed(vcpu) && 6090 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6091 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6092 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6093 6094 if (vmx_has_valid_vmcs12(vcpu)) { 6095 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6096 6097 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6098 if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) 6099 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6100 6101 if (is_guest_mode(vcpu) && 6102 nested_cpu_has_shadow_vmcs(vmcs12) && 6103 vmcs12->vmcs_link_pointer != INVALID_GPA) 6104 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6105 } 6106 6107 if (vmx->nested.smm.vmxon) 6108 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6109 6110 if (vmx->nested.smm.guest_mode) 6111 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6112 6113 if (is_guest_mode(vcpu)) { 6114 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6115 6116 if (vmx->nested.nested_run_pending) 6117 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6118 6119 if (vmx->nested.mtf_pending) 6120 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6121 6122 if (nested_cpu_has_preemption_timer(vmcs12) && 6123 vmx->nested.has_preemption_timer_deadline) { 6124 kvm_state.hdr.vmx.flags |= 6125 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6126 kvm_state.hdr.vmx.preemption_timer_deadline = 6127 vmx->nested.preemption_timer_deadline; 6128 } 6129 } 6130 } 6131 6132 if (user_data_size < kvm_state.size) 6133 goto out; 6134 6135 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6136 return -EFAULT; 6137 6138 if (!vmx_has_valid_vmcs12(vcpu)) 6139 goto out; 6140 6141 /* 6142 * When running L2, the authoritative vmcs12 state is in the 6143 * vmcs02. When running L1, the authoritative vmcs12 state is 6144 * in the shadow or enlightened vmcs linked to vmcs01, unless 6145 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6146 * vmcs12 state is in the vmcs12 already. 6147 */ 6148 if (is_guest_mode(vcpu)) { 6149 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6150 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6151 } else { 6152 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6153 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6154 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 6155 /* 6156 * L1 hypervisor is not obliged to keep eVMCS 6157 * clean fields data always up-to-date while 6158 * not in guest mode, 'hv_clean_fields' is only 6159 * supposed to be actual upon vmentry so we need 6160 * to ignore it here and do full copy. 6161 */ 6162 copy_enlightened_to_vmcs12(vmx, 0); 6163 else if (enable_shadow_vmcs) 6164 copy_shadow_to_vmcs12(vmx); 6165 } 6166 } 6167 6168 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6169 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6170 6171 /* 6172 * Copy over the full allocated size of vmcs12 rather than just the size 6173 * of the struct. 6174 */ 6175 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6176 return -EFAULT; 6177 6178 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6179 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6180 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6181 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6182 return -EFAULT; 6183 } 6184 out: 6185 return kvm_state.size; 6186 } 6187 6188 /* 6189 * Forcibly leave nested mode in order to be able to reset the VCPU later on. 6190 */ 6191 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6192 { 6193 if (is_guest_mode(vcpu)) { 6194 to_vmx(vcpu)->nested.nested_run_pending = 0; 6195 nested_vmx_vmexit(vcpu, -1, 0, 0); 6196 } 6197 free_nested(vcpu); 6198 } 6199 6200 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6201 struct kvm_nested_state __user *user_kvm_nested_state, 6202 struct kvm_nested_state *kvm_state) 6203 { 6204 struct vcpu_vmx *vmx = to_vmx(vcpu); 6205 struct vmcs12 *vmcs12; 6206 enum vm_entry_failure_code ignored; 6207 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6208 &user_kvm_nested_state->data.vmx[0]; 6209 int ret; 6210 6211 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6212 return -EINVAL; 6213 6214 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6215 if (kvm_state->hdr.vmx.smm.flags) 6216 return -EINVAL; 6217 6218 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6219 return -EINVAL; 6220 6221 /* 6222 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6223 * enable eVMCS capability on vCPU. However, since then 6224 * code was changed such that flag signals vmcs12 should 6225 * be copied into eVMCS in guest memory. 6226 * 6227 * To preserve backwards compatability, allow user 6228 * to set this flag even when there is no VMXON region. 6229 */ 6230 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6231 return -EINVAL; 6232 } else { 6233 if (!nested_vmx_allowed(vcpu)) 6234 return -EINVAL; 6235 6236 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6237 return -EINVAL; 6238 } 6239 6240 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6241 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6242 return -EINVAL; 6243 6244 if (kvm_state->hdr.vmx.smm.flags & 6245 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6246 return -EINVAL; 6247 6248 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6249 return -EINVAL; 6250 6251 /* 6252 * SMM temporarily disables VMX, so we cannot be in guest mode, 6253 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6254 * must be zero. 6255 */ 6256 if (is_smm(vcpu) ? 6257 (kvm_state->flags & 6258 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6259 : kvm_state->hdr.vmx.smm.flags) 6260 return -EINVAL; 6261 6262 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6263 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6264 return -EINVAL; 6265 6266 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6267 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) 6268 return -EINVAL; 6269 6270 vmx_leave_nested(vcpu); 6271 6272 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6273 return 0; 6274 6275 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6276 ret = enter_vmx_operation(vcpu); 6277 if (ret) 6278 return ret; 6279 6280 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6281 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6282 /* See vmx_has_valid_vmcs12. */ 6283 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6284 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6285 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6286 return -EINVAL; 6287 else 6288 return 0; 6289 } 6290 6291 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6292 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6293 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6294 return -EINVAL; 6295 6296 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6297 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6298 /* 6299 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6300 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6301 * restored yet. EVMCS will be mapped from 6302 * nested_get_vmcs12_pages(). 6303 */ 6304 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; 6305 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 6306 } else { 6307 return -EINVAL; 6308 } 6309 6310 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 6311 vmx->nested.smm.vmxon = true; 6312 vmx->nested.vmxon = false; 6313 6314 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 6315 vmx->nested.smm.guest_mode = true; 6316 } 6317 6318 vmcs12 = get_vmcs12(vcpu); 6319 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 6320 return -EFAULT; 6321 6322 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 6323 return -EINVAL; 6324 6325 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6326 return 0; 6327 6328 vmx->nested.nested_run_pending = 6329 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 6330 6331 vmx->nested.mtf_pending = 6332 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 6333 6334 ret = -EINVAL; 6335 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6336 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6337 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6338 6339 if (kvm_state->size < 6340 sizeof(*kvm_state) + 6341 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 6342 goto error_guest_mode; 6343 6344 if (copy_from_user(shadow_vmcs12, 6345 user_vmx_nested_state->shadow_vmcs12, 6346 sizeof(*shadow_vmcs12))) { 6347 ret = -EFAULT; 6348 goto error_guest_mode; 6349 } 6350 6351 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6352 !shadow_vmcs12->hdr.shadow_vmcs) 6353 goto error_guest_mode; 6354 } 6355 6356 vmx->nested.has_preemption_timer_deadline = false; 6357 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 6358 vmx->nested.has_preemption_timer_deadline = true; 6359 vmx->nested.preemption_timer_deadline = 6360 kvm_state->hdr.vmx.preemption_timer_deadline; 6361 } 6362 6363 if (nested_vmx_check_controls(vcpu, vmcs12) || 6364 nested_vmx_check_host_state(vcpu, vmcs12) || 6365 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6366 goto error_guest_mode; 6367 6368 vmx->nested.dirty_vmcs12 = true; 6369 ret = nested_vmx_enter_non_root_mode(vcpu, false); 6370 if (ret) 6371 goto error_guest_mode; 6372 6373 return 0; 6374 6375 error_guest_mode: 6376 vmx->nested.nested_run_pending = 0; 6377 return ret; 6378 } 6379 6380 void nested_vmx_set_vmcs_shadowing_bitmap(void) 6381 { 6382 if (enable_shadow_vmcs) { 6383 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 6384 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 6385 } 6386 } 6387 6388 /* 6389 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo 6390 * that madness to get the encoding for comparison. 6391 */ 6392 #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) 6393 6394 static u64 nested_vmx_calc_vmcs_enum_msr(void) 6395 { 6396 /* 6397 * Note these are the so called "index" of the VMCS field encoding, not 6398 * the index into vmcs12. 6399 */ 6400 unsigned int max_idx, idx; 6401 int i; 6402 6403 /* 6404 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in 6405 * vmcs12, regardless of whether or not the associated feature is 6406 * exposed to L1. Simply find the field with the highest index. 6407 */ 6408 max_idx = 0; 6409 for (i = 0; i < nr_vmcs12_fields; i++) { 6410 /* The vmcs12 table is very, very sparsely populated. */ 6411 if (!vmcs_field_to_offset_table[i]) 6412 continue; 6413 6414 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); 6415 if (idx > max_idx) 6416 max_idx = idx; 6417 } 6418 6419 return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; 6420 } 6421 6422 /* 6423 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 6424 * returned for the various VMX controls MSRs when nested VMX is enabled. 6425 * The same values should also be used to verify that vmcs12 control fields are 6426 * valid during nested entry from L1 to L2. 6427 * Each of these control msrs has a low and high 32-bit half: A low bit is on 6428 * if the corresponding bit in the (32-bit) control field *must* be on, and a 6429 * bit in the high half is on if the corresponding bit in the control field 6430 * may be on. See also vmx_control_verify(). 6431 */ 6432 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) 6433 { 6434 /* 6435 * Note that as a general rule, the high half of the MSRs (bits in 6436 * the control fields which may be 1) should be initialized by the 6437 * intersection of the underlying hardware's MSR (i.e., features which 6438 * can be supported) and the list of features we want to expose - 6439 * because they are known to be properly supported in our code. 6440 * Also, usually, the low half of the MSRs (bits which must be 1) can 6441 * be set to 0, meaning that L1 may turn off any of these bits. The 6442 * reason is that if one of these bits is necessary, it will appear 6443 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 6444 * fields of vmcs01 and vmcs02, will turn these bits off - and 6445 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 6446 * These rules have exceptions below. 6447 */ 6448 6449 /* pin-based controls */ 6450 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 6451 msrs->pinbased_ctls_low, 6452 msrs->pinbased_ctls_high); 6453 msrs->pinbased_ctls_low |= 6454 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6455 msrs->pinbased_ctls_high &= 6456 PIN_BASED_EXT_INTR_MASK | 6457 PIN_BASED_NMI_EXITING | 6458 PIN_BASED_VIRTUAL_NMIS | 6459 (enable_apicv ? PIN_BASED_POSTED_INTR : 0); 6460 msrs->pinbased_ctls_high |= 6461 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6462 PIN_BASED_VMX_PREEMPTION_TIMER; 6463 6464 /* exit controls */ 6465 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 6466 msrs->exit_ctls_low, 6467 msrs->exit_ctls_high); 6468 msrs->exit_ctls_low = 6469 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 6470 6471 msrs->exit_ctls_high &= 6472 #ifdef CONFIG_X86_64 6473 VM_EXIT_HOST_ADDR_SPACE_SIZE | 6474 #endif 6475 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 6476 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 6477 msrs->exit_ctls_high |= 6478 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 6479 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 6480 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 6481 6482 /* We support free control of debug control saving. */ 6483 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 6484 6485 /* entry controls */ 6486 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 6487 msrs->entry_ctls_low, 6488 msrs->entry_ctls_high); 6489 msrs->entry_ctls_low = 6490 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 6491 msrs->entry_ctls_high &= 6492 #ifdef CONFIG_X86_64 6493 VM_ENTRY_IA32E_MODE | 6494 #endif 6495 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | 6496 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 6497 msrs->entry_ctls_high |= 6498 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 6499 6500 /* We support free control of debug control loading. */ 6501 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 6502 6503 /* cpu-based controls */ 6504 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 6505 msrs->procbased_ctls_low, 6506 msrs->procbased_ctls_high); 6507 msrs->procbased_ctls_low = 6508 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6509 msrs->procbased_ctls_high &= 6510 CPU_BASED_INTR_WINDOW_EXITING | 6511 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 6512 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 6513 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 6514 CPU_BASED_CR3_STORE_EXITING | 6515 #ifdef CONFIG_X86_64 6516 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 6517 #endif 6518 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 6519 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 6520 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 6521 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 6522 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 6523 /* 6524 * We can allow some features even when not supported by the 6525 * hardware. For example, L1 can specify an MSR bitmap - and we 6526 * can use it to avoid exits to L1 - even when L0 runs L2 6527 * without MSR bitmaps. 6528 */ 6529 msrs->procbased_ctls_high |= 6530 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6531 CPU_BASED_USE_MSR_BITMAPS; 6532 6533 /* We support free control of CR3 access interception. */ 6534 msrs->procbased_ctls_low &= 6535 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 6536 6537 /* 6538 * secondary cpu-based controls. Do not include those that 6539 * depend on CPUID bits, they are added later by 6540 * vmx_vcpu_after_set_cpuid. 6541 */ 6542 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) 6543 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 6544 msrs->secondary_ctls_low, 6545 msrs->secondary_ctls_high); 6546 6547 msrs->secondary_ctls_low = 0; 6548 msrs->secondary_ctls_high &= 6549 SECONDARY_EXEC_DESC | 6550 SECONDARY_EXEC_ENABLE_RDTSCP | 6551 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 6552 SECONDARY_EXEC_WBINVD_EXITING | 6553 SECONDARY_EXEC_APIC_REGISTER_VIRT | 6554 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 6555 SECONDARY_EXEC_RDRAND_EXITING | 6556 SECONDARY_EXEC_ENABLE_INVPCID | 6557 SECONDARY_EXEC_RDSEED_EXITING | 6558 SECONDARY_EXEC_XSAVES | 6559 SECONDARY_EXEC_TSC_SCALING; 6560 6561 /* 6562 * We can emulate "VMCS shadowing," even if the hardware 6563 * doesn't support it. 6564 */ 6565 msrs->secondary_ctls_high |= 6566 SECONDARY_EXEC_SHADOW_VMCS; 6567 6568 if (enable_ept) { 6569 /* nested EPT: emulate EPT also to L1 */ 6570 msrs->secondary_ctls_high |= 6571 SECONDARY_EXEC_ENABLE_EPT; 6572 msrs->ept_caps = 6573 VMX_EPT_PAGE_WALK_4_BIT | 6574 VMX_EPT_PAGE_WALK_5_BIT | 6575 VMX_EPTP_WB_BIT | 6576 VMX_EPT_INVEPT_BIT | 6577 VMX_EPT_EXECUTE_ONLY_BIT; 6578 6579 msrs->ept_caps &= ept_caps; 6580 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 6581 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 6582 VMX_EPT_1GB_PAGE_BIT; 6583 if (enable_ept_ad_bits) { 6584 msrs->secondary_ctls_high |= 6585 SECONDARY_EXEC_ENABLE_PML; 6586 msrs->ept_caps |= VMX_EPT_AD_BIT; 6587 } 6588 } 6589 6590 if (cpu_has_vmx_vmfunc()) { 6591 msrs->secondary_ctls_high |= 6592 SECONDARY_EXEC_ENABLE_VMFUNC; 6593 /* 6594 * Advertise EPTP switching unconditionally 6595 * since we emulate it 6596 */ 6597 if (enable_ept) 6598 msrs->vmfunc_controls = 6599 VMX_VMFUNC_EPTP_SWITCHING; 6600 } 6601 6602 /* 6603 * Old versions of KVM use the single-context version without 6604 * checking for support, so declare that it is supported even 6605 * though it is treated as global context. The alternative is 6606 * not failing the single-context invvpid, and it is worse. 6607 */ 6608 if (enable_vpid) { 6609 msrs->secondary_ctls_high |= 6610 SECONDARY_EXEC_ENABLE_VPID; 6611 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 6612 VMX_VPID_EXTENT_SUPPORTED_MASK; 6613 } 6614 6615 if (enable_unrestricted_guest) 6616 msrs->secondary_ctls_high |= 6617 SECONDARY_EXEC_UNRESTRICTED_GUEST; 6618 6619 if (flexpriority_enabled) 6620 msrs->secondary_ctls_high |= 6621 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6622 6623 if (enable_sgx) 6624 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 6625 6626 /* miscellaneous data */ 6627 rdmsr(MSR_IA32_VMX_MISC, 6628 msrs->misc_low, 6629 msrs->misc_high); 6630 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; 6631 msrs->misc_low |= 6632 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 6633 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 6634 VMX_MISC_ACTIVITY_HLT | 6635 VMX_MISC_ACTIVITY_WAIT_SIPI; 6636 msrs->misc_high = 0; 6637 6638 /* 6639 * This MSR reports some information about VMX support. We 6640 * should return information about the VMX we emulate for the 6641 * guest, and the VMCS structure we give it - not about the 6642 * VMX support of the underlying hardware. 6643 */ 6644 msrs->basic = 6645 VMCS12_REVISION | 6646 VMX_BASIC_TRUE_CTLS | 6647 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 6648 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 6649 6650 if (cpu_has_vmx_basic_inout()) 6651 msrs->basic |= VMX_BASIC_INOUT; 6652 6653 /* 6654 * These MSRs specify bits which the guest must keep fixed on 6655 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 6656 * We picked the standard core2 setting. 6657 */ 6658 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 6659 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 6660 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 6661 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 6662 6663 /* These MSRs specify bits which the guest must keep fixed off. */ 6664 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 6665 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 6666 6667 msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); 6668 } 6669 6670 void nested_vmx_hardware_unsetup(void) 6671 { 6672 int i; 6673 6674 if (enable_shadow_vmcs) { 6675 for (i = 0; i < VMX_BITMAP_NR; i++) 6676 free_page((unsigned long)vmx_bitmap[i]); 6677 } 6678 } 6679 6680 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 6681 { 6682 int i; 6683 6684 if (!cpu_has_vmx_shadow_vmcs()) 6685 enable_shadow_vmcs = 0; 6686 if (enable_shadow_vmcs) { 6687 for (i = 0; i < VMX_BITMAP_NR; i++) { 6688 /* 6689 * The vmx_bitmap is not tied to a VM and so should 6690 * not be charged to a memcg. 6691 */ 6692 vmx_bitmap[i] = (unsigned long *) 6693 __get_free_page(GFP_KERNEL); 6694 if (!vmx_bitmap[i]) { 6695 nested_vmx_hardware_unsetup(); 6696 return -ENOMEM; 6697 } 6698 } 6699 6700 init_vmcs_shadow_fields(); 6701 } 6702 6703 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 6704 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 6705 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 6706 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 6707 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 6708 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 6709 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 6710 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; 6711 exit_handlers[EXIT_REASON_VMON] = handle_vmon; 6712 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 6713 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 6714 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 6715 6716 return 0; 6717 } 6718 6719 struct kvm_x86_nested_ops vmx_nested_ops = { 6720 .check_events = vmx_check_nested_events, 6721 .hv_timer_pending = nested_vmx_preemption_timer_pending, 6722 .triple_fault = nested_vmx_triple_fault, 6723 .get_state = vmx_get_nested_state, 6724 .set_state = vmx_set_nested_state, 6725 .get_nested_state_pages = vmx_get_nested_state_pages, 6726 .write_log_dirty = nested_vmx_write_pml_buffer, 6727 .enable_evmcs = nested_enable_evmcs, 6728 .get_evmcs_version = nested_get_evmcs_version, 6729 }; 6730