// SPDX-License-Identifier: GPL-2.0

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == -1ull &&
	    !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

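/*
 * Emulate a VMX abort.  Rather than modelling the architectural shutdown
 * state, KVM logs the abort indicator and requests a triple fault, which
 * effectively resets L1 (see the TODO below).
 */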
static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't simply reset the guest here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, -1ull);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
		kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
		vmx->nested.hv_evmcs = NULL;
	}

	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vmx_register_cache_reset(vcpu);
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_clean(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vm_exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else if (fault->error_code & PFERR_RSVD_MASK)
		vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
	else
		vm_exit_reason = EXIT_REASON_EPT_VIOLATION;

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
	kvm_init_shadow_ept_mmu(vcpu,
				to_vmx(vcpu)->nested.msrs.ept_caps &
				VMX_EPT_EXECUTE_ONLY_BIT,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	nested_ept_new_eptp(vcpu);
	vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

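/*
 * Apply vmcs12's PFEC_MASK/PFEC_MATCH filter: the result of the mask/match
 * comparison is XOR'd with whether L1 intercepts #PF in its exception bitmap,
 * so a "match" can mean either "do exit to L1" or "don't exit" depending on
 * the bitmap setting.
 */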
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}


/*
 * KVM wants to inject page faults which it received into the guest. This
 * function checks whether, in a nested guest, a fault needs to be injected
 * into L1 or L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (nr == PF_VECTOR) {
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
			return 1;
		}
	} else if (vmcs12->exception_bitmap & (1u << nr)) {
		if (nr == DB_VECTOR) {
			if (!has_payload) {
				payload = vcpu->arch.dr6;
				payload &= ~DR6_BT;
				payload ^= DR6_ACTIVE_LOW;
			}
			*exit_qual = payload;
		} else
			*exit_qual = 0;
		return 1;
	}

	return 0;
}


static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
	    !to_vmx(vcpu)->nested.nested_run_pending) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * Check if MSR is intercepted for L01 MSR bitmap.
 */
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;

	if (msr <= 0x1fff) {
		return !!test_bit(msr, msr_bitmap + 0x800 / f);
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
	}

	return true;
}

/*
 * If an MSR is allowed by L0, we should check whether it is allowed by L1.
 * The corresponding bit will be cleared unless both L0 and L1 allow it.
 */
static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
						 unsigned long *msr_bitmap_nested,
						 u32 msr, int type)
{
	int f = sizeof(unsigned long);

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
			/* read-low */
			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
			/* write-low */
			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
			/* read-high */
			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
			/* write-high */
			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

	}
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively disable them below.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
#ifdef CONFIG_X86_64
	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif

	/*
	 * Checking the L0->L1 bitmap is trying to verify two things:
	 *
	 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
	 *    ensures that we do not accidentally generate an L02 MSR bitmap
	 *    from the L12 MSR bitmap that is too permissive.
	 * 2. That L1 or L2s have actually used the MSR. This avoids
	 *    unnecessary merging of the bitmap if the MSR is unused. This
	 *    works properly because we only update the L01 MSR bitmap lazily.
	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
	 *    updated to reflect this when L1 (or its L2s) actually write to
	 *    the MSR.
	 */
	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_SPEC_CTRL,
					MSR_TYPE_R | MSR_TYPE_W);

	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_PRED_CMD,
					MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct kvm_host_map map;
	struct vmcs12 *shadow;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	shadow = get_shadow_vmcs12(vcpu);

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
		return;

	memcpy(shadow, map.hva, VMCS12_SIZE);
	kvm_vcpu_unmap(vcpu, &map, false);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    (CC(!nested_cpu_has_vid(vmcs12)) ||
	     CC(!nested_exit_intr_ack_set(vcpu)) ||
	     CC((vmcs12->posted_intr_nv & 0xff00)) ||
	     CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

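/*
 * Validate the address/count pair of a VM-entry/VM-exit MSR switch area: the
 * base must be a legal, 16-byte aligned GPA and the last byte of the list
 * must also be a legal GPA.
 */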
static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

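/*
 * Check one entry of a VM-entry MSR-load list: in addition to the common
 * checks, FS/GS base and MSR_IA32_SMM_MONITOR_CTL are rejected.
 */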
static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware as
 * much as possible, process all valid entries before failing rather than
 * prechecking for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC) {
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
						offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

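/*
 * Keep vmcs02's VM-exit MSR-store ("autostore") list in sync with vmcs12:
 * add the MSR if L1 wants it stored on VM-exit, drop it otherwise.
 */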
static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_slot;
	bool in_autostore_list;
	int last;

	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
	in_autostore_list = msr_autostore_slot >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here. Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
			 * instead of reading the value from the vmcs02 VMExit
			 * MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore. Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

/*
 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_PDPTE;
		return -EINVAL;
	}

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	return 0;
}

/*
 * Returns whether KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
	 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
	 * full TLB flush from the guest's perspective. This is required even
	 * if VPID is disabled in the host as KVM may need to synchronize the
	 * MMU in response to the guest TLB flush.
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled. As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * If VPID is enabled and used by vmcs12, but L2 does not have a unique
	 * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
	 * a VPID for L2, flush the current context as the effective ASID is
	 * common to both L1 and L2.
	 *
	 * Defer the flush so that it runs after vmcs02.EPTP has been set by
	 * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
	 * redundant flushes further down the nested pipeline.
	 *
	 * If a TLB flush isn't required due to any of the above, and vpid12 is
	 * changing then the new "virtual" VPID (vpid12) will reuse the same
	 * "real" VPID (vpid02), and so needs to be flushed. There's no direct
	 * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
	 * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate
	 * guest-physical mappings, so there is no need to sync the nEPT MMU.
	 */
	if (!nested_has_guest_tlb_tag(vcpu)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	} else if (is_vmenter &&
		   vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		vpid_sync_context(nested_get_vpid02(vcpu));
	}
}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmx->nested.msrs.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

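/*
 * Restore one of the "true" VMX control MSRs from userspace.  The low 32 bits
 * encode the allowed 0-settings (bits that must be 1) and the high 32 bits the
 * allowed 1-settings, so both halves are checked against what KVM supports.
 */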
static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 supported;
	u32 *lowp, *highp;

	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		lowp = &vmx->nested.msrs.pinbased_ctls_low;
		highp = &vmx->nested.msrs.pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		lowp = &vmx->nested.msrs.procbased_ctls_low;
		highp = &vmx->nested.msrs.procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		lowp = &vmx->nested.msrs.exit_ctls_low;
		highp = &vmx->nested.msrs.exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		lowp = &vmx->nested.msrs.entry_ctls_low;
		highp = &vmx->nested.msrs.entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		lowp = &vmx->nested.msrs.secondary_ctls_low;
		highp = &vmx->nested.msrs.secondary_ctls_high;
		break;
	default:
		BUG();
	}

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc;

	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				   vmx->nested.msrs.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap;

	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
					   vmx->nested.msrs.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 *msr;

	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		msr = &vmx->nested.msrs.cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		msr = &vmx->nested.msrs.cr4_fixed0;
		break;
	default:
		BUG();
	}

	/*
	 * 1 bits (which indicate bits which "must-be-1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*msr = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmx->nested.msrs.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

have 1494 * been modified by the L1 guest. Note, "writable" in this context means 1495 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1496 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1497 * VM-exit information fields (which are actually writable if the vCPU is 1498 * configured to support "VMWRITE to any supported field in the VMCS"). 1499 */ 1500 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1501 { 1502 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1503 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1504 struct shadow_vmcs_field field; 1505 unsigned long val; 1506 int i; 1507 1508 if (WARN_ON(!shadow_vmcs)) 1509 return; 1510 1511 preempt_disable(); 1512 1513 vmcs_load(shadow_vmcs); 1514 1515 for (i = 0; i < max_shadow_read_write_fields; i++) { 1516 field = shadow_read_write_fields[i]; 1517 val = __vmcs_readl(field.encoding); 1518 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1519 } 1520 1521 vmcs_clear(shadow_vmcs); 1522 vmcs_load(vmx->loaded_vmcs->vmcs); 1523 1524 preempt_enable(); 1525 } 1526 1527 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1528 { 1529 const struct shadow_vmcs_field *fields[] = { 1530 shadow_read_write_fields, 1531 shadow_read_only_fields 1532 }; 1533 const int max_fields[] = { 1534 max_shadow_read_write_fields, 1535 max_shadow_read_only_fields 1536 }; 1537 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1538 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1539 struct shadow_vmcs_field field; 1540 unsigned long val; 1541 int i, q; 1542 1543 if (WARN_ON(!shadow_vmcs)) 1544 return; 1545 1546 vmcs_load(shadow_vmcs); 1547 1548 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1549 for (i = 0; i < max_fields[q]; i++) { 1550 field = fields[q][i]; 1551 val = vmcs12_read_any(vmcs12, field.encoding, 1552 field.offset); 1553 __vmcs_writel(field.encoding, val); 1554 } 1555 } 1556 1557 vmcs_clear(shadow_vmcs); 1558 vmcs_load(vmx->loaded_vmcs->vmcs); 1559 } 1560 1561 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) 1562 { 1563 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1564 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1565 1566 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1567 vmcs12->tpr_threshold = evmcs->tpr_threshold; 1568 vmcs12->guest_rip = evmcs->guest_rip; 1569 1570 if (unlikely(!(hv_clean_fields & 1571 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1572 vmcs12->guest_rsp = evmcs->guest_rsp; 1573 vmcs12->guest_rflags = evmcs->guest_rflags; 1574 vmcs12->guest_interruptibility_info = 1575 evmcs->guest_interruptibility_info; 1576 } 1577 1578 if (unlikely(!(hv_clean_fields & 1579 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1580 vmcs12->cpu_based_vm_exec_control = 1581 evmcs->cpu_based_vm_exec_control; 1582 } 1583 1584 if (unlikely(!(hv_clean_fields & 1585 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1586 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1587 } 1588 1589 if (unlikely(!(hv_clean_fields & 1590 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1591 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1592 } 1593 1594 if (unlikely(!(hv_clean_fields & 1595 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1596 vmcs12->vm_entry_intr_info_field = 1597 evmcs->vm_entry_intr_info_field; 1598 vmcs12->vm_entry_exception_error_code = 1599 evmcs->vm_entry_exception_error_code; 1600 vmcs12->vm_entry_instruction_len = 1601 evmcs->vm_entry_instruction_len; 1602 } 1603 1604 if (unlikely(!(hv_clean_fields & 1605 
static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
	}

	/*
	 * Not used?
	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
	 * vmcs12->page_fault_error_code_mask =
	 *		evmcs->page_fault_error_code_mask;
	 * vmcs12->page_fault_error_code_match =
	 *		evmcs->page_fault_error_code_match;
	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
	 */

	/*
	 * Read only fields:
	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
	 * vmcs12->exit_qualification = evmcs->exit_qualification;
	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
	 *
	 * Not present in struct vmcs12:
	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return;
}

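/*
 * Copy the fields that KVM may have modified in the cached vmcs12 back into
 * the enlightened VMCS shared with L1.  Host state, bitmaps and other fields
 * that KVM never changes are deliberately left alone (see the comment below).
 */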
evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1813 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1814 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1815 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1816 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1817 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1818 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1819 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1820 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1821 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1822 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1823 * evmcs->page_fault_error_code_mask = 1824 * vmcs12->page_fault_error_code_mask; 1825 * evmcs->page_fault_error_code_match = 1826 * vmcs12->page_fault_error_code_match; 1827 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1828 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1829 * evmcs->tsc_offset = vmcs12->tsc_offset; 1830 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1831 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1832 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1833 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1834 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1835 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1836 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1837 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1838 * 1839 * Not present in struct vmcs12: 1840 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1841 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1842 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1843 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1844 */ 1845 1846 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1847 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1848 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1849 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1850 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1851 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1852 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1853 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1854 1855 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1856 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1857 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1858 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1859 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1860 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1861 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1862 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1863 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1864 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1865 1866 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1867 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1868 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1869 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1870 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1871 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1872 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1873 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1874 1875 evmcs->guest_es_base = vmcs12->guest_es_base; 1876 evmcs->guest_cs_base = vmcs12->guest_cs_base; 
1877 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1878 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1879 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1880 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1881 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1882 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1883 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1884 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1885 1886 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1887 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1888 1889 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1890 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1891 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1892 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1893 1894 evmcs->guest_pending_dbg_exceptions = 1895 vmcs12->guest_pending_dbg_exceptions; 1896 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1897 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1898 1899 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1900 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1901 1902 evmcs->guest_cr0 = vmcs12->guest_cr0; 1903 evmcs->guest_cr3 = vmcs12->guest_cr3; 1904 evmcs->guest_cr4 = vmcs12->guest_cr4; 1905 evmcs->guest_dr7 = vmcs12->guest_dr7; 1906 1907 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1908 1909 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1910 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1911 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1912 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1913 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1914 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1915 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1916 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1917 1918 evmcs->exit_qualification = vmcs12->exit_qualification; 1919 1920 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1921 evmcs->guest_rsp = vmcs12->guest_rsp; 1922 evmcs->guest_rflags = vmcs12->guest_rflags; 1923 1924 evmcs->guest_interruptibility_info = 1925 vmcs12->guest_interruptibility_info; 1926 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1927 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1928 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1929 evmcs->vm_entry_exception_error_code = 1930 vmcs12->vm_entry_exception_error_code; 1931 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 1932 1933 evmcs->guest_rip = vmcs12->guest_rip; 1934 1935 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 1936 1937 return; 1938 } 1939 1940 /* 1941 * This is an equivalent of the nested hypervisor executing the vmptrld 1942 * instruction. 
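 * Unlike a real VMPTRLD there is no instruction operand to parse: the
 * current eVMCS GPA is fetched from the guest's Hyper-V VP assist page
 * by nested_enlightened_vmentry().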
1943 */ 1944 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( 1945 struct kvm_vcpu *vcpu, bool from_launch) 1946 { 1947 struct vcpu_vmx *vmx = to_vmx(vcpu); 1948 bool evmcs_gpa_changed = false; 1949 u64 evmcs_gpa; 1950 1951 if (likely(!vmx->nested.enlightened_vmcs_enabled)) 1952 return EVMPTRLD_DISABLED; 1953 1954 if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) { 1955 nested_release_evmcs(vcpu); 1956 return EVMPTRLD_DISABLED; 1957 } 1958 1959 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { 1960 vmx->nested.current_vmptr = -1ull; 1961 1962 nested_release_evmcs(vcpu); 1963 1964 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), 1965 &vmx->nested.hv_evmcs_map)) 1966 return EVMPTRLD_ERROR; 1967 1968 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; 1969 1970 /* 1971 * Currently, KVM only supports eVMCS version 1 1972 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this 1973 * value to first u32 field of eVMCS which should specify eVMCS 1974 * VersionNumber. 1975 * 1976 * Guest should be aware of supported eVMCS versions by host by 1977 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is 1978 * expected to set this CPUID leaf according to the value 1979 * returned in vmcs_version from nested_enable_evmcs(). 1980 * 1981 * However, it turns out that Microsoft Hyper-V fails to comply 1982 * to their own invented interface: When Hyper-V use eVMCS, it 1983 * just sets first u32 field of eVMCS to revision_id specified 1984 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number 1985 * which is one of the supported versions specified in 1986 * CPUID.0x4000000A.EAX[0:15]. 1987 * 1988 * To overcome Hyper-V bug, we accept here either a supported 1989 * eVMCS version or VMCS12 revision_id as valid values for first 1990 * u32 field of eVMCS. 1991 */ 1992 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && 1993 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { 1994 nested_release_evmcs(vcpu); 1995 return EVMPTRLD_VMFAIL; 1996 } 1997 1998 vmx->nested.hv_evmcs_vmptr = evmcs_gpa; 1999 2000 evmcs_gpa_changed = true; 2001 /* 2002 * Unlike normal vmcs12, enlightened vmcs12 is not fully 2003 * reloaded from guest's memory (read only fields, fields not 2004 * present in struct hv_enlightened_vmcs, ...). Make sure there 2005 * are no leftovers. 2006 */ 2007 if (from_launch) { 2008 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2009 memset(vmcs12, 0, sizeof(*vmcs12)); 2010 vmcs12->hdr.revision_id = VMCS12_REVISION; 2011 } 2012 2013 } 2014 2015 /* 2016 * Clean fields data can't be used on VMLAUNCH and when we switch 2017 * between different L2 guests as KVM keeps a single VMCS12 per L1. 
2018 */ 2019 if (from_launch || evmcs_gpa_changed) 2020 vmx->nested.hv_evmcs->hv_clean_fields &= 2021 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2022 2023 return EVMPTRLD_SUCCEEDED; 2024 } 2025 2026 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2027 { 2028 struct vcpu_vmx *vmx = to_vmx(vcpu); 2029 2030 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2031 copy_vmcs12_to_enlightened(vmx); 2032 else 2033 copy_vmcs12_to_shadow(vmx); 2034 2035 vmx->nested.need_vmcs12_to_shadow_sync = false; 2036 } 2037 2038 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2039 { 2040 struct vcpu_vmx *vmx = 2041 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2042 2043 vmx->nested.preemption_timer_expired = true; 2044 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2045 kvm_vcpu_kick(&vmx->vcpu); 2046 2047 return HRTIMER_NORESTART; 2048 } 2049 2050 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2051 { 2052 struct vcpu_vmx *vmx = to_vmx(vcpu); 2053 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2054 2055 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2056 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2057 2058 if (!vmx->nested.has_preemption_timer_deadline) { 2059 vmx->nested.preemption_timer_deadline = 2060 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2061 vmx->nested.has_preemption_timer_deadline = true; 2062 } 2063 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2064 } 2065 2066 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2067 u64 preemption_timeout) 2068 { 2069 struct vcpu_vmx *vmx = to_vmx(vcpu); 2070 2071 /* 2072 * A timer value of zero is architecturally guaranteed to cause 2073 * a VMExit prior to executing any instructions in the guest. 2074 */ 2075 if (preemption_timeout == 0) { 2076 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2077 return; 2078 } 2079 2080 if (vcpu->arch.virtual_tsc_khz == 0) 2081 return; 2082 2083 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2084 preemption_timeout *= 1000000; 2085 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2086 hrtimer_start(&vmx->nested.preemption_timer, 2087 ktime_add_ns(ktime_get(), preemption_timeout), 2088 HRTIMER_MODE_ABS_PINNED); 2089 } 2090 2091 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2092 { 2093 if (vmx->nested.nested_run_pending && 2094 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2095 return vmcs12->guest_ia32_efer; 2096 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2097 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2098 else 2099 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2100 } 2101 2102 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2103 { 2104 /* 2105 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2106 * according to L0's settings (vmcs12 is irrelevant here). Host 2107 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2108 * will be set as needed prior to VMLAUNCH/VMRESUME. 2109 */ 2110 if (vmx->nested.vmcs02_initialized) 2111 return; 2112 vmx->nested.vmcs02_initialized = true; 2113 2114 /* 2115 * We don't care what the EPTP value is we just need to guarantee 2116 * it's valid so we don't get a false positive when doing early 2117 * consistency checks. 2118 */ 2119 if (enable_ept && nested_early_check) 2120 vmcs_write64(EPT_POINTER, 2121 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2122 2123 /* All VMFUNCs are currently emulated through L0 vmexits. 
*/ 2124 if (cpu_has_vmx_vmfunc()) 2125 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2126 2127 if (cpu_has_vmx_posted_intr()) 2128 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2129 2130 if (cpu_has_vmx_msr_bitmap()) 2131 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2132 2133 /* 2134 * PML is emulated for L2, but never enabled in hardware as the MMU 2135 * handles A/D emulation. Disabling PML for L2 also avoids having to 2136 * deal with filtering out L2 GPAs from the buffer. 2137 */ 2138 if (enable_pml) { 2139 vmcs_write64(PML_ADDRESS, 0); 2140 vmcs_write16(GUEST_PML_INDEX, -1); 2141 } 2142 2143 if (cpu_has_vmx_encls_vmexit()) 2144 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); 2145 2146 /* 2147 * Set the MSR load/store lists to match L0's settings. Only the 2148 * addresses are constant (for vmcs02), the counts can change based 2149 * on L2's behavior, e.g. switching to/from long mode. 2150 */ 2151 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2152 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2153 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2154 2155 vmx_set_constant_host_state(vmx); 2156 } 2157 2158 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2159 struct vmcs12 *vmcs12) 2160 { 2161 prepare_vmcs02_constant_state(vmx); 2162 2163 vmcs_write64(VMCS_LINK_POINTER, -1ull); 2164 2165 if (enable_vpid) { 2166 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2167 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2168 else 2169 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2170 } 2171 } 2172 2173 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2174 { 2175 u32 exec_control; 2176 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2177 2178 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2179 prepare_vmcs02_early_rare(vmx, vmcs12); 2180 2181 /* 2182 * PIN CONTROLS 2183 */ 2184 exec_control = vmx_pin_based_exec_ctrl(vmx); 2185 exec_control |= (vmcs12->pin_based_vm_exec_control & 2186 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2187 2188 /* Posted interrupts setting is only taken from vmcs12. */ 2189 if (nested_cpu_has_posted_intr(vmcs12)) { 2190 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2191 vmx->nested.pi_pending = false; 2192 } else { 2193 exec_control &= ~PIN_BASED_POSTED_INTR; 2194 } 2195 pin_controls_set(vmx, exec_control); 2196 2197 /* 2198 * EXEC CONTROLS 2199 */ 2200 exec_control = vmx_exec_control(vmx); /* L0's desires */ 2201 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2202 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2203 exec_control &= ~CPU_BASED_TPR_SHADOW; 2204 exec_control |= vmcs12->cpu_based_vm_exec_control; 2205 2206 vmx->nested.l1_tpr_threshold = -1; 2207 if (exec_control & CPU_BASED_TPR_SHADOW) 2208 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2209 #ifdef CONFIG_X86_64 2210 else 2211 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2212 CPU_BASED_CR8_STORE_EXITING; 2213 #endif 2214 2215 /* 2216 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2217 * for I/O port accesses. 2218 */ 2219 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2220 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2221 2222 /* 2223 * This bit will be computed in nested_get_vmcs12_pages, because 2224 * we do not have access to L1's MSR bitmap yet. For now, keep 2225 * the same bit as before, hoping to avoid multiple VMWRITEs that 2226 * only set/clear this bit. 
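	 * nested_get_vmcs12_pages() makes the final decision: it sets
	 * CPU_BASED_USE_MSR_BITMAPS only if nested_vmx_prepare_msr_bitmap()
	 * succeeds in merging L0's and L1's bitmaps, and clears it
	 * otherwise.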
2227 */ 2228 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2229 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2230 2231 exec_controls_set(vmx, exec_control); 2232 2233 /* 2234 * SECONDARY EXEC CONTROLS 2235 */ 2236 if (cpu_has_secondary_exec_ctrls()) { 2237 exec_control = vmx->secondary_exec_control; 2238 2239 /* Take the following fields only from vmcs12 */ 2240 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2241 SECONDARY_EXEC_ENABLE_INVPCID | 2242 SECONDARY_EXEC_ENABLE_RDTSCP | 2243 SECONDARY_EXEC_XSAVES | 2244 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2245 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2246 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2247 SECONDARY_EXEC_ENABLE_VMFUNC | 2248 SECONDARY_EXEC_TSC_SCALING); 2249 if (nested_cpu_has(vmcs12, 2250 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2251 exec_control |= vmcs12->secondary_vm_exec_control; 2252 2253 /* PML is emulated and never enabled in hardware for L2. */ 2254 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2255 2256 /* VMCS shadowing for L2 is emulated for now */ 2257 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2258 2259 /* 2260 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2261 * will not have to rewrite the controls just for this bit. 2262 */ 2263 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2264 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2265 exec_control |= SECONDARY_EXEC_DESC; 2266 2267 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2268 vmcs_write16(GUEST_INTR_STATUS, 2269 vmcs12->guest_intr_status); 2270 2271 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2272 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2273 2274 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2275 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2276 2277 secondary_exec_controls_set(vmx, exec_control); 2278 } 2279 2280 /* 2281 * ENTRY CONTROLS 2282 * 2283 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2284 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2285 * on the related bits (if supported by the CPU) in the hope that 2286 * we can avoid VMWrites during vmx_set_efer(). 2287 */ 2288 exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & 2289 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; 2290 if (cpu_has_load_ia32_efer()) { 2291 if (guest_efer & EFER_LMA) 2292 exec_control |= VM_ENTRY_IA32E_MODE; 2293 if (guest_efer != host_efer) 2294 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2295 } 2296 vm_entry_controls_set(vmx, exec_control); 2297 2298 /* 2299 * EXIT CONTROLS 2300 * 2301 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2302 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2303 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
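	 * In other words, the value computed below only needs to be a good
	 * initial guess; if the guest/host EFER relationship turns out to
	 * be different, vmx_set_efer() toggles VM_EXIT_LOAD_IA32_EFER (and
	 * the corresponding entry control) later.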
2304 */ 2305 exec_control = vmx_vmexit_ctrl(); 2306 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2307 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2308 vm_exit_controls_set(vmx, exec_control); 2309 2310 /* 2311 * Interrupt/Exception Fields 2312 */ 2313 if (vmx->nested.nested_run_pending) { 2314 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2315 vmcs12->vm_entry_intr_info_field); 2316 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2317 vmcs12->vm_entry_exception_error_code); 2318 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2319 vmcs12->vm_entry_instruction_len); 2320 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2321 vmcs12->guest_interruptibility_info); 2322 vmx->loaded_vmcs->nmi_known_unmasked = 2323 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2324 } else { 2325 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2326 } 2327 } 2328 2329 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2330 { 2331 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2332 2333 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2334 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2335 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2336 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2337 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2338 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2339 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2340 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2341 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2342 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2343 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2344 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2345 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2346 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2347 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2348 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2349 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2350 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2351 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2352 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2353 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2354 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2355 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2356 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2357 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2358 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2359 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2360 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2361 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2362 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2363 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2364 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2365 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2366 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2367 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2368 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2369 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2370 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2371 2372 vmx->segment_cache.bitmask = 0; 2373 } 2374 2375 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2376 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2377 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2378 
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2379 vmcs12->guest_pending_dbg_exceptions); 2380 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2381 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2382 2383 /* 2384 * L1 may access the L2's PDPTR, so save them to construct 2385 * vmcs12 2386 */ 2387 if (enable_ept) { 2388 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2389 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2390 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2391 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2392 } 2393 2394 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2395 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2396 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2397 } 2398 2399 if (nested_cpu_has_xsaves(vmcs12)) 2400 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2401 2402 /* 2403 * Whether page-faults are trapped is determined by a combination of 2404 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2405 * doesn't care about page faults then we should set all of these to 2406 * L1's desires. However, if L0 does care about (some) page faults, it 2407 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2408 * simply ask to exit on each and every L2 page fault. This is done by 2409 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2410 * Note that below we don't need special code to set EB.PF beyond the 2411 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2412 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2413 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2414 */ 2415 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2416 /* 2417 * TODO: if both L0 and L1 need the same MASK and MATCH, 2418 * go ahead and use it? 2419 */ 2420 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2421 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2422 } else { 2423 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2424 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2425 } 2426 2427 if (cpu_has_vmx_apicv()) { 2428 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2429 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2430 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2431 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2432 } 2433 2434 /* 2435 * Make sure the msr_autostore list is up to date before we set the 2436 * count in the vmcs02. 2437 */ 2438 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2439 2440 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2441 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2442 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2443 2444 set_cr4_guest_host_mask(vmx); 2445 } 2446 2447 /* 2448 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2449 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2450 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2451 * guest in a way that will both be appropriate to L1's requests, and our 2452 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2453 * function also has additional necessary side-effects, like setting various 2454 * vcpu->arch fields. 2455 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2456 * is assigned to entry_failure_code on failure. 
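 * (In the current implementation the failure value is actually -EINVAL;
 * callers only test for a non-zero return.)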
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry,
			  enum vm_entry_failure_code *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool load_guest_pdptrs_vmcs12 = false;

	if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
		prepare_vmcs02_rare(vmx, vmcs12);
		vmx->nested.dirty_vmcs12 = false;

		load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) ||
			!(vmx->nested.hv_evmcs->hv_clean_fields &
			  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
	}

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
	}
	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);

	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	vmx_update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
	}

	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
			vcpu->arch.l1_tsc_offset,
			vmx_get_l2_tsc_offset(vcpu),
			vmx_get_l2_tsc_multiplier(vcpu));

	vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
			vcpu->arch.l1_tsc_scaling_ratio,
			vmx_get_l2_tsc_multiplier(vcpu));

	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (kvm_has_tsc_control)
		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);

	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);

	/*
	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying the
	 * bits that we consider must remain enabled.
	 * The CR0_READ_SHADOW is what L2 should have expected to read given
	 * the specifications by L1; it's not enough to take
	 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may have
	 * more bits set than L1 expected.
	 */
	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
	vmx_set_efer(vcpu, vcpu->arch.efer);

	/*
	 * Guest state is invalid and unrestricted guest is disabled,
	 * which means L1 attempted VMEntry to L2 with invalid state.
	 * Fail the VMEntry.
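	 * The ENTRY_FAIL_DEFAULT code stored below is turned by the caller,
	 * nested_vmx_enter_non_root_mode(), into a nested VM-exit with
	 * reason EXIT_REASON_INVALID_STATE and the code as exit
	 * qualification.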
2543 */ 2544 if (CC(!vmx_guest_state_valid(vcpu))) { 2545 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2546 return -EINVAL; 2547 } 2548 2549 /* Shadow page tables on either EPT or shadow page tables. */ 2550 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2551 from_vmentry, entry_failure_code)) 2552 return -EINVAL; 2553 2554 /* 2555 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 2556 * on nested VM-Exit, which can occur without actually running L2 and 2557 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with 2558 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the 2559 * transition to HLT instead of running L2. 2560 */ 2561 if (enable_ept) 2562 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2563 2564 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2565 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2566 is_pae_paging(vcpu)) { 2567 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2568 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2569 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2570 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2571 } 2572 2573 if (!enable_ept) 2574 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2575 2576 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2577 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2578 vmcs12->guest_ia32_perf_global_ctrl))) 2579 return -EINVAL; 2580 2581 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2582 kvm_rip_write(vcpu, vmcs12->guest_rip); 2583 2584 /* 2585 * It was observed that genuine Hyper-V running in L1 doesn't reset 2586 * 'hv_clean_fields' by itself, it only sets the corresponding dirty 2587 * bits when it changes a field in eVMCS. Mark all fields as clean 2588 * here. 2589 */ 2590 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2591 vmx->nested.hv_evmcs->hv_clean_fields |= 2592 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2593 2594 return 0; 2595 } 2596 2597 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2598 { 2599 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2600 nested_cpu_has_virtual_nmis(vmcs12))) 2601 return -EINVAL; 2602 2603 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2604 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2605 return -EINVAL; 2606 2607 return 0; 2608 } 2609 2610 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2611 { 2612 struct vcpu_vmx *vmx = to_vmx(vcpu); 2613 2614 /* Check for memory type validity */ 2615 switch (new_eptp & VMX_EPTP_MT_MASK) { 2616 case VMX_EPTP_MT_UC: 2617 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2618 return false; 2619 break; 2620 case VMX_EPTP_MT_WB: 2621 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2622 return false; 2623 break; 2624 default: 2625 return false; 2626 } 2627 2628 /* Page-walk levels validity. 
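	 * A 4-level EPT walk (VMX_EPTP_PWL_4) covers 48-bit guest-physical
	 * addresses, a 5-level walk (VMX_EPTP_PWL_5) covers 57 bits; either
	 * may only be used by L1 if the corresponding
	 * VMX_EPT_PAGE_WALK_*_BIT capability is exposed in ept_caps.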
*/ 2629 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2630 case VMX_EPTP_PWL_5: 2631 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2632 return false; 2633 break; 2634 case VMX_EPTP_PWL_4: 2635 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2636 return false; 2637 break; 2638 default: 2639 return false; 2640 } 2641 2642 /* Reserved bits should not be set */ 2643 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2644 return false; 2645 2646 /* AD, if set, should be supported */ 2647 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2648 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2649 return false; 2650 } 2651 2652 return true; 2653 } 2654 2655 /* 2656 * Checks related to VM-Execution Control Fields 2657 */ 2658 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2659 struct vmcs12 *vmcs12) 2660 { 2661 struct vcpu_vmx *vmx = to_vmx(vcpu); 2662 2663 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2664 vmx->nested.msrs.pinbased_ctls_low, 2665 vmx->nested.msrs.pinbased_ctls_high)) || 2666 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2667 vmx->nested.msrs.procbased_ctls_low, 2668 vmx->nested.msrs.procbased_ctls_high))) 2669 return -EINVAL; 2670 2671 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2672 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2673 vmx->nested.msrs.secondary_ctls_low, 2674 vmx->nested.msrs.secondary_ctls_high))) 2675 return -EINVAL; 2676 2677 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2678 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2679 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2680 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2681 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2682 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2683 nested_vmx_check_nmi_controls(vmcs12) || 2684 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2685 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2686 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2687 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2688 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2689 return -EINVAL; 2690 2691 if (!nested_cpu_has_preemption_timer(vmcs12) && 2692 nested_cpu_has_save_preemption_timer(vmcs12)) 2693 return -EINVAL; 2694 2695 if (nested_cpu_has_ept(vmcs12) && 2696 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2697 return -EINVAL; 2698 2699 if (nested_cpu_has_vmfunc(vmcs12)) { 2700 if (CC(vmcs12->vm_function_control & 2701 ~vmx->nested.msrs.vmfunc_controls)) 2702 return -EINVAL; 2703 2704 if (nested_cpu_has_eptp_switching(vmcs12)) { 2705 if (CC(!nested_cpu_has_ept(vmcs12)) || 2706 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2707 return -EINVAL; 2708 } 2709 } 2710 2711 return 0; 2712 } 2713 2714 /* 2715 * Checks related to VM-Exit Control Fields 2716 */ 2717 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2718 struct vmcs12 *vmcs12) 2719 { 2720 struct vcpu_vmx *vmx = to_vmx(vcpu); 2721 2722 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2723 vmx->nested.msrs.exit_ctls_low, 2724 vmx->nested.msrs.exit_ctls_high)) || 2725 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2726 return -EINVAL; 2727 2728 return 0; 2729 } 2730 2731 /* 2732 * Checks related to VM-Entry Control Fields 2733 */ 2734 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2735 struct vmcs12 *vmcs12) 2736 { 2737 struct 
vcpu_vmx *vmx = to_vmx(vcpu); 2738 2739 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2740 vmx->nested.msrs.entry_ctls_low, 2741 vmx->nested.msrs.entry_ctls_high))) 2742 return -EINVAL; 2743 2744 /* 2745 * From the Intel SDM, volume 3: 2746 * Fields relevant to VM-entry event injection must be set properly. 2747 * These fields are the VM-entry interruption-information field, the 2748 * VM-entry exception error code, and the VM-entry instruction length. 2749 */ 2750 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2751 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2752 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2753 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2754 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2755 bool should_have_error_code; 2756 bool urg = nested_cpu_has2(vmcs12, 2757 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2758 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2759 2760 /* VM-entry interruption-info field: interruption type */ 2761 if (CC(intr_type == INTR_TYPE_RESERVED) || 2762 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2763 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2764 return -EINVAL; 2765 2766 /* VM-entry interruption-info field: vector */ 2767 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2768 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2769 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2770 return -EINVAL; 2771 2772 /* VM-entry interruption-info field: deliver error code */ 2773 should_have_error_code = 2774 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2775 x86_exception_has_error_code(vector); 2776 if (CC(has_error_code != should_have_error_code)) 2777 return -EINVAL; 2778 2779 /* VM-entry exception error code */ 2780 if (CC(has_error_code && 2781 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2782 return -EINVAL; 2783 2784 /* VM-entry interruption-info field: reserved bits */ 2785 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2786 return -EINVAL; 2787 2788 /* VM-entry instruction length */ 2789 switch (intr_type) { 2790 case INTR_TYPE_SOFT_EXCEPTION: 2791 case INTR_TYPE_SOFT_INTR: 2792 case INTR_TYPE_PRIV_SW_EXCEPTION: 2793 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2794 CC(vmcs12->vm_entry_instruction_len == 0 && 2795 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2796 return -EINVAL; 2797 } 2798 } 2799 2800 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2801 return -EINVAL; 2802 2803 return 0; 2804 } 2805 2806 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2807 struct vmcs12 *vmcs12) 2808 { 2809 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2810 nested_check_vm_exit_controls(vcpu, vmcs12) || 2811 nested_check_vm_entry_controls(vcpu, vmcs12)) 2812 return -EINVAL; 2813 2814 if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled) 2815 return nested_evmcs_check_controls(vmcs12); 2816 2817 return 0; 2818 } 2819 2820 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2821 struct vmcs12 *vmcs12) 2822 { 2823 bool ia32e; 2824 2825 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2826 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2827 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) 2828 return -EINVAL; 2829 2830 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2831 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2832 return -EINVAL; 2833 2834 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2835 
CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2836 return -EINVAL; 2837 2838 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2839 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2840 vmcs12->host_ia32_perf_global_ctrl))) 2841 return -EINVAL; 2842 2843 #ifdef CONFIG_X86_64 2844 ia32e = !!(vcpu->arch.efer & EFER_LMA); 2845 #else 2846 ia32e = false; 2847 #endif 2848 2849 if (ia32e) { 2850 if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || 2851 CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2852 return -EINVAL; 2853 } else { 2854 if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || 2855 CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2856 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2857 CC((vmcs12->host_rip) >> 32)) 2858 return -EINVAL; 2859 } 2860 2861 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2862 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2863 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2864 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2865 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2866 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2867 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2868 CC(vmcs12->host_cs_selector == 0) || 2869 CC(vmcs12->host_tr_selector == 0) || 2870 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2871 return -EINVAL; 2872 2873 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2874 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2875 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2876 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2877 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2878 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2879 return -EINVAL; 2880 2881 /* 2882 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2883 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2884 * the values of the LMA and LME bits in the field must each be that of 2885 * the host address-space size VM-exit control. 
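	 * E.g. if vmcs12's VM_EXIT_HOST_ADDR_SPACE_SIZE control is 1, then
	 * host_ia32_efer must have both EFER.LMA and EFER.LME set; if the
	 * control is 0, both bits must be clear.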
2886 */ 2887 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2888 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2889 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2890 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2891 return -EINVAL; 2892 } 2893 2894 return 0; 2895 } 2896 2897 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2898 struct vmcs12 *vmcs12) 2899 { 2900 int r = 0; 2901 struct vmcs12 *shadow; 2902 struct kvm_host_map map; 2903 2904 if (vmcs12->vmcs_link_pointer == -1ull) 2905 return 0; 2906 2907 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2908 return -EINVAL; 2909 2910 if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) 2911 return -EINVAL; 2912 2913 shadow = map.hva; 2914 2915 if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || 2916 CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2917 r = -EINVAL; 2918 2919 kvm_vcpu_unmap(vcpu, &map, false); 2920 return r; 2921 } 2922 2923 /* 2924 * Checks related to Guest Non-register State 2925 */ 2926 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2927 { 2928 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2929 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 2930 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 2931 return -EINVAL; 2932 2933 return 0; 2934 } 2935 2936 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 2937 struct vmcs12 *vmcs12, 2938 enum vm_entry_failure_code *entry_failure_code) 2939 { 2940 bool ia32e; 2941 2942 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2943 2944 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 2945 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 2946 return -EINVAL; 2947 2948 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 2949 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 2950 return -EINVAL; 2951 2952 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 2953 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 2954 return -EINVAL; 2955 2956 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 2957 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 2958 return -EINVAL; 2959 } 2960 2961 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2962 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2963 vmcs12->guest_ia32_perf_global_ctrl))) 2964 return -EINVAL; 2965 2966 /* 2967 * If the load IA32_EFER VM-entry control is 1, the following checks 2968 * are performed on the field for the IA32_EFER MSR: 2969 * - Bits reserved in the IA32_EFER MSR must be 0. 2970 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 2971 * the IA-32e mode guest VM-exit control. It must also be identical 2972 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 2973 * CR0.PG) is 1. 
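	 * (The "IA-32e mode guest" bit referenced above is
	 * VM_ENTRY_IA32E_MODE in the VM-entry controls, which is what the
	 * ia32e flag below is derived from.)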
2974 */ 2975 if (to_vmx(vcpu)->nested.nested_run_pending && 2976 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 2977 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 2978 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 2979 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 2980 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 2981 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 2982 return -EINVAL; 2983 } 2984 2985 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 2986 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 2987 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 2988 return -EINVAL; 2989 2990 if (nested_check_guest_non_reg_state(vmcs12)) 2991 return -EINVAL; 2992 2993 return 0; 2994 } 2995 2996 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 2997 { 2998 struct vcpu_vmx *vmx = to_vmx(vcpu); 2999 unsigned long cr3, cr4; 3000 bool vm_fail; 3001 3002 if (!nested_early_check) 3003 return 0; 3004 3005 if (vmx->msr_autoload.host.nr) 3006 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3007 if (vmx->msr_autoload.guest.nr) 3008 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3009 3010 preempt_disable(); 3011 3012 vmx_prepare_switch_to_guest(vcpu); 3013 3014 /* 3015 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3016 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3017 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3018 * there is no need to preserve other bits or save/restore the field. 3019 */ 3020 vmcs_writel(GUEST_RFLAGS, 0); 3021 3022 cr3 = __get_current_cr3_fast(); 3023 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3024 vmcs_writel(HOST_CR3, cr3); 3025 vmx->loaded_vmcs->host_state.cr3 = cr3; 3026 } 3027 3028 cr4 = cr4_read_shadow(); 3029 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3030 vmcs_writel(HOST_CR4, cr4); 3031 vmx->loaded_vmcs->host_state.cr4 = cr4; 3032 } 3033 3034 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3035 vmx->loaded_vmcs->launched); 3036 3037 if (vmx->msr_autoload.host.nr) 3038 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3039 if (vmx->msr_autoload.guest.nr) 3040 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3041 3042 if (vm_fail) { 3043 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3044 3045 preempt_enable(); 3046 3047 trace_kvm_nested_vmenter_failed( 3048 "early hardware check VM-instruction error: ", error); 3049 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3050 return 1; 3051 } 3052 3053 /* 3054 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3055 */ 3056 if (hw_breakpoint_active()) 3057 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3058 local_irq_enable(); 3059 preempt_enable(); 3060 3061 /* 3062 * A non-failing VMEntry means we somehow entered guest mode with 3063 * an illegal RIP, and that's just the tip of the iceberg. There 3064 * is no telling what memory has been modified or what state has 3065 * been exposed to unknown code. Hitting this all but guarantees 3066 * a (very critical) hardware issue. 3067 */ 3068 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3069 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3070 3071 return 0; 3072 } 3073 3074 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3075 { 3076 struct vcpu_vmx *vmx = to_vmx(vcpu); 3077 3078 /* 3079 * hv_evmcs may end up being not mapped after migration (when 3080 * L2 was running), map it here to make sure vmcs12 changes are 3081 * properly reflected. 
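	 * This runs from the KVM_REQ_GET_NESTED_STATE_PAGES path (see
	 * vmx_get_nested_state_pages() below) rather than at nested state
	 * restore itself, when guest memory may not yet be accessible.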
3082 */ 3083 if (vmx->nested.enlightened_vmcs_enabled && 3084 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3085 enum nested_evmptrld_status evmptrld_status = 3086 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3087 3088 if (evmptrld_status == EVMPTRLD_VMFAIL || 3089 evmptrld_status == EVMPTRLD_ERROR) 3090 return false; 3091 3092 /* 3093 * Post migration VMCS12 always provides the most actual 3094 * information, copy it to eVMCS upon entry. 3095 */ 3096 vmx->nested.need_vmcs12_to_shadow_sync = true; 3097 } 3098 3099 return true; 3100 } 3101 3102 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3103 { 3104 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3105 struct vcpu_vmx *vmx = to_vmx(vcpu); 3106 struct kvm_host_map *map; 3107 struct page *page; 3108 u64 hpa; 3109 3110 if (!vcpu->arch.pdptrs_from_userspace && 3111 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3112 /* 3113 * Reload the guest's PDPTRs since after a migration 3114 * the guest CR3 might be restored prior to setting the nested 3115 * state which can lead to a load of wrong PDPTRs. 3116 */ 3117 if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))) 3118 return false; 3119 } 3120 3121 3122 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3123 /* 3124 * Translate L1 physical address to host physical 3125 * address for vmcs02. Keep the page pinned, so this 3126 * physical address remains valid. We keep a reference 3127 * to it so we can release it later. 3128 */ 3129 if (vmx->nested.apic_access_page) { /* shouldn't happen */ 3130 kvm_release_page_clean(vmx->nested.apic_access_page); 3131 vmx->nested.apic_access_page = NULL; 3132 } 3133 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); 3134 if (!is_error_page(page)) { 3135 vmx->nested.apic_access_page = page; 3136 hpa = page_to_phys(vmx->nested.apic_access_page); 3137 vmcs_write64(APIC_ACCESS_ADDR, hpa); 3138 } else { 3139 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", 3140 __func__); 3141 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3142 vcpu->run->internal.suberror = 3143 KVM_INTERNAL_ERROR_EMULATION; 3144 vcpu->run->internal.ndata = 0; 3145 return false; 3146 } 3147 } 3148 3149 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3150 map = &vmx->nested.virtual_apic_map; 3151 3152 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3153 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3154 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3155 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3156 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3157 /* 3158 * The processor will never use the TPR shadow, simply 3159 * clear the bit from the execution control. Such a 3160 * configuration is useless, but it happens in tests. 3161 * For any other configuration, failing the vm entry is 3162 * _not_ what the processor does but it's basically the 3163 * only possibility we have. 3164 */ 3165 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3166 } else { 3167 /* 3168 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3169 * force VM-Entry to fail. 
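			 * -1ull is neither 4 KiB aligned nor within the
			 * supported physical address width, so the VM-entry
			 * consistency check on the virtual-APIC address is
			 * guaranteed to fail.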
3170 */ 3171 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); 3172 } 3173 } 3174 3175 if (nested_cpu_has_posted_intr(vmcs12)) { 3176 map = &vmx->nested.pi_desc_map; 3177 3178 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3179 vmx->nested.pi_desc = 3180 (struct pi_desc *)(((void *)map->hva) + 3181 offset_in_page(vmcs12->posted_intr_desc_addr)); 3182 vmcs_write64(POSTED_INTR_DESC_ADDR, 3183 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3184 } else { 3185 /* 3186 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3187 * access the contents of the VMCS12 posted interrupt 3188 * descriptor. (Note that KVM may do this when it 3189 * should not, per the architectural specification.) 3190 */ 3191 vmx->nested.pi_desc = NULL; 3192 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3193 } 3194 } 3195 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3196 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3197 else 3198 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3199 3200 return true; 3201 } 3202 3203 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3204 { 3205 if (!nested_get_evmcs_page(vcpu)) { 3206 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3207 __func__); 3208 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3209 vcpu->run->internal.suberror = 3210 KVM_INTERNAL_ERROR_EMULATION; 3211 vcpu->run->internal.ndata = 0; 3212 3213 return false; 3214 } 3215 3216 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3217 return false; 3218 3219 return true; 3220 } 3221 3222 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3223 { 3224 struct vmcs12 *vmcs12; 3225 struct vcpu_vmx *vmx = to_vmx(vcpu); 3226 gpa_t dst; 3227 3228 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3229 return 0; 3230 3231 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3232 return 1; 3233 3234 /* 3235 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3236 * set is already checked as part of A/D emulation. 3237 */ 3238 vmcs12 = get_vmcs12(vcpu); 3239 if (!nested_cpu_has_pml(vmcs12)) 3240 return 0; 3241 3242 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3243 vmx->nested.pml_full = true; 3244 return 1; 3245 } 3246 3247 gpa &= ~0xFFFull; 3248 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3249 3250 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3251 offset_in_page(dst), sizeof(gpa))) 3252 return 0; 3253 3254 vmcs12->guest_pml_index--; 3255 3256 return 0; 3257 } 3258 3259 /* 3260 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3261 * for running VMX instructions (except VMXON, whose prerequisites are 3262 * slightly different). It also specifies what exception to inject otherwise. 3263 * Note that many of these exceptions have priority over VM exits, so they 3264 * don't have to be checked again here. 
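 * What remains to be checked here is the pair below: #UD if the vCPU is
 * not in VMX operation (no prior VMXON), and #GP(0) if CPL > 0.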
3265 */ 3266 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3267 { 3268 if (!to_vmx(vcpu)->nested.vmxon) { 3269 kvm_queue_exception(vcpu, UD_VECTOR); 3270 return 0; 3271 } 3272 3273 if (vmx_get_cpl(vcpu)) { 3274 kvm_inject_gp(vcpu, 0); 3275 return 0; 3276 } 3277 3278 return 1; 3279 } 3280 3281 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3282 { 3283 u8 rvi = vmx_get_rvi(); 3284 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3285 3286 return ((rvi & 0xf0) > (vppr & 0xf0)); 3287 } 3288 3289 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3290 struct vmcs12 *vmcs12); 3291 3292 /* 3293 * If from_vmentry is false, this is being called from state restore (either RSM 3294 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 3295 * 3296 * Returns: 3297 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3298 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3299 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3300 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3301 */ 3302 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3303 bool from_vmentry) 3304 { 3305 struct vcpu_vmx *vmx = to_vmx(vcpu); 3306 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3307 enum vm_entry_failure_code entry_failure_code; 3308 bool evaluate_pending_interrupts; 3309 union vmx_exit_reason exit_reason = { 3310 .basic = EXIT_REASON_INVALID_STATE, 3311 .failed_vmentry = 1, 3312 }; 3313 u32 failed_index; 3314 3315 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) 3316 kvm_vcpu_flush_tlb_current(vcpu); 3317 3318 evaluate_pending_interrupts = exec_controls_get(vmx) & 3319 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3320 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3321 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3322 3323 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3324 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3325 if (kvm_mpx_supported() && 3326 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 3327 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3328 3329 /* 3330 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3331 * nested early checks are disabled. In the event of a "late" VM-Fail, 3332 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3333 * software model to the pre-VMEntry host state. When EPT is disabled, 3334 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3335 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3336 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3337 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3338 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3339 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3340 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3341 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3342 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3343 * path would need to manually save/restore vmcs01.GUEST_CR3. 
3344	 */
3345	if (!enable_ept && !nested_early_check)
3346		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3347
3348	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3349
3350	prepare_vmcs02_early(vmx, vmcs12);
3351
3352	if (from_vmentry) {
3353		if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
3354			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3355			return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
3356		}
3357
3358		if (nested_vmx_check_vmentry_hw(vcpu)) {
3359			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3360			return NVMX_VMENTRY_VMFAIL;
3361		}
3362
3363		if (nested_vmx_check_guest_state(vcpu, vmcs12,
3364						 &entry_failure_code)) {
3365			exit_reason.basic = EXIT_REASON_INVALID_STATE;
3366			vmcs12->exit_qualification = entry_failure_code;
3367			goto vmentry_fail_vmexit;
3368		}
3369	}
3370
3371	enter_guest_mode(vcpu);
3372
3373	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
3374		exit_reason.basic = EXIT_REASON_INVALID_STATE;
3375		vmcs12->exit_qualification = entry_failure_code;
3376		goto vmentry_fail_vmexit_guest_mode;
3377	}
3378
3379	if (from_vmentry) {
3380		failed_index = nested_vmx_load_msr(vcpu,
3381						   vmcs12->vm_entry_msr_load_addr,
3382						   vmcs12->vm_entry_msr_load_count);
3383		if (failed_index) {
3384			exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
3385			vmcs12->exit_qualification = failed_index;
3386			goto vmentry_fail_vmexit_guest_mode;
3387		}
3388	} else {
3389		/*
3390		 * The MMU is not initialized to point at the right entities yet and
3391		 * "get pages" would need to read data from the guest (i.e. we will
3392		 * need to perform gpa to hpa translation). Request a call
3393		 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
3394		 * have already been set at vmentry time and should not be reset.
3395		 */
3396		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
3397	}
3398
3399	/*
3400	 * If L1 had a pending IRQ/NMI that was not delivered before it
3401	 * executed VMLAUNCH/VMRESUME because delivery was disallowed (e.g.
3402	 * interrupts were disabled), L0 needs to evaluate whether this pending
3403	 * event should cause an exit from L2 to L1, or whether it should be
3404	 * delivered directly to L2 (e.g. in case L1 doesn't intercept
3405	 * EXTERNAL_INTERRUPT).
3406	 *
3407	 * Usually this would be handled by the processor noticing an
3408	 * IRQ/NMI window request, or checking RVI during evaluation of
3409	 * pending virtual interrupts. However, this setting was done
3410	 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3411	 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3412	 */
3413	if (unlikely(evaluate_pending_interrupts))
3414		kvm_make_request(KVM_REQ_EVENT, vcpu);
3415
3416	/*
3417	 * Do not start the preemption timer hrtimer until after we know
3418	 * we are successful, so that only nested_vmx_vmexit needs to cancel
3419	 * the timer.
3420	 */
3421	vmx->nested.preemption_timer_expired = false;
3422	if (nested_cpu_has_preemption_timer(vmcs12)) {
3423		u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
3424		vmx_start_preemption_timer(vcpu, timer_value);
3425	}
3426
3427	/*
3428	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3429	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3430	 * returned as far as L1 is concerned. It will only return (and set
3431	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
3432	 */
3433	return NVMX_VMENTRY_SUCCESS;
3434
3435	/*
3436	 * A failed consistency check that leads to a VMExit during L1's
3437	 * VMEnter to L2 is a variation of a normal VMexit, as explained in
3438	 * 26.7 "VM-entry failures during or after loading guest state".
3439 */ 3440 vmentry_fail_vmexit_guest_mode: 3441 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3442 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3443 leave_guest_mode(vcpu); 3444 3445 vmentry_fail_vmexit: 3446 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3447 3448 if (!from_vmentry) 3449 return NVMX_VMENTRY_VMEXIT; 3450 3451 load_vmcs12_host_state(vcpu, vmcs12); 3452 vmcs12->vm_exit_reason = exit_reason.full; 3453 if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 3454 vmx->nested.need_vmcs12_to_shadow_sync = true; 3455 return NVMX_VMENTRY_VMEXIT; 3456 } 3457 3458 /* 3459 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3460 * for running an L2 nested guest. 3461 */ 3462 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3463 { 3464 struct vmcs12 *vmcs12; 3465 enum nvmx_vmentry_status status; 3466 struct vcpu_vmx *vmx = to_vmx(vcpu); 3467 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3468 enum nested_evmptrld_status evmptrld_status; 3469 3470 if (!nested_vmx_check_permission(vcpu)) 3471 return 1; 3472 3473 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3474 if (evmptrld_status == EVMPTRLD_ERROR) { 3475 kvm_queue_exception(vcpu, UD_VECTOR); 3476 return 1; 3477 } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) { 3478 return nested_vmx_failInvalid(vcpu); 3479 } 3480 3481 if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && 3482 vmx->nested.current_vmptr == -1ull)) 3483 return nested_vmx_failInvalid(vcpu); 3484 3485 vmcs12 = get_vmcs12(vcpu); 3486 3487 /* 3488 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3489 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3490 * rather than RFLAGS.ZF, and no error number is stored to the 3491 * VM-instruction error field. 3492 */ 3493 if (CC(vmcs12->hdr.shadow_vmcs)) 3494 return nested_vmx_failInvalid(vcpu); 3495 3496 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 3497 copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); 3498 /* Enlightened VMCS doesn't have launch state */ 3499 vmcs12->launch_state = !launch; 3500 } else if (enable_shadow_vmcs) { 3501 copy_shadow_to_vmcs12(vmx); 3502 } 3503 3504 /* 3505 * The nested entry process starts with enforcing various prerequisites 3506 * on vmcs12 as required by the Intel SDM, and act appropriately when 3507 * they fail: As the SDM explains, some conditions should cause the 3508 * instruction to fail, while others will cause the instruction to seem 3509 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3510 * To speed up the normal (success) code path, we should avoid checking 3511 * for misconfigurations which will anyway be caught by the processor 3512 * when using the merged vmcs02. 3513 */ 3514 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3515 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3516 3517 if (CC(vmcs12->launch_state == launch)) 3518 return nested_vmx_fail(vcpu, 3519 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS 3520 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3521 3522 if (nested_vmx_check_controls(vcpu, vmcs12)) 3523 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3524 3525 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3526 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3527 3528 /* 3529 * We're finally done with prerequisite checking, and can start with 3530 * the nested entry. 
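	 *
	 * From this point on, a failure is reported either as a VMFail via
	 * nested_vmx_fail(), as a synthesized VM-exit, or as a KVM internal
	 * error, depending on the status returned by
	 * nested_vmx_enter_non_root_mode() (see the vmentry_failed label below).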
3531 */ 3532 vmx->nested.nested_run_pending = 1; 3533 vmx->nested.has_preemption_timer_deadline = false; 3534 status = nested_vmx_enter_non_root_mode(vcpu, true); 3535 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3536 goto vmentry_failed; 3537 3538 /* Emulate processing of posted interrupts on VM-Enter. */ 3539 if (nested_cpu_has_posted_intr(vmcs12) && 3540 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3541 vmx->nested.pi_pending = true; 3542 kvm_make_request(KVM_REQ_EVENT, vcpu); 3543 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3544 } 3545 3546 /* Hide L1D cache contents from the nested guest. */ 3547 vmx->vcpu.arch.l1tf_flush_l1d = true; 3548 3549 /* 3550 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3551 * also be used as part of restoring nVMX state for 3552 * snapshot restore (migration). 3553 * 3554 * In this flow, it is assumed that vmcs12 cache was 3555 * transferred as part of captured nVMX state and should 3556 * therefore not be read from guest memory (which may not 3557 * exist on destination host yet). 3558 */ 3559 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3560 3561 switch (vmcs12->guest_activity_state) { 3562 case GUEST_ACTIVITY_HLT: 3563 /* 3564 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3565 * awakened by event injection or by an NMI-window VM-exit or 3566 * by an interrupt-window VM-exit, halt the vcpu. 3567 */ 3568 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3569 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3570 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3571 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3572 vmx->nested.nested_run_pending = 0; 3573 return kvm_vcpu_halt(vcpu); 3574 } 3575 break; 3576 case GUEST_ACTIVITY_WAIT_SIPI: 3577 vmx->nested.nested_run_pending = 0; 3578 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3579 break; 3580 default: 3581 break; 3582 } 3583 3584 return 1; 3585 3586 vmentry_failed: 3587 vmx->nested.nested_run_pending = 0; 3588 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3589 return 0; 3590 if (status == NVMX_VMENTRY_VMEXIT) 3591 return 1; 3592 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3593 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3594 } 3595 3596 /* 3597 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3598 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3599 * This function returns the new value we should put in vmcs12.guest_cr0. 3600 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3601 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3602 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3603 * didn't trap the bit, because if L1 did, so would L0). 3604 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3605 * been modified by L2, and L1 knows it. So just leave the old value of 3606 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3607 * isn't relevant, because if L0 traps this bit it can set it to anything. 3608 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3609 * changed these bits, and therefore they need to be updated, but L0 3610 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3611 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 
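 *
 * Note that the three masks used below partition the CR0 bits accordingly:
 * vcpu->arch.cr0_guest_owned_bits covers case 1, vmcs12->cr0_guest_host_mask
 * covers case 2, and the complement of their union covers case 3.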
3612 */ 3613 static inline unsigned long 3614 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3615 { 3616 return 3617 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3618 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3619 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3620 vcpu->arch.cr0_guest_owned_bits)); 3621 } 3622 3623 static inline unsigned long 3624 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3625 { 3626 return 3627 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3628 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3629 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3630 vcpu->arch.cr4_guest_owned_bits)); 3631 } 3632 3633 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3634 struct vmcs12 *vmcs12) 3635 { 3636 u32 idt_vectoring; 3637 unsigned int nr; 3638 3639 if (vcpu->arch.exception.injected) { 3640 nr = vcpu->arch.exception.nr; 3641 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3642 3643 if (kvm_exception_is_soft(nr)) { 3644 vmcs12->vm_exit_instruction_len = 3645 vcpu->arch.event_exit_inst_len; 3646 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3647 } else 3648 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3649 3650 if (vcpu->arch.exception.has_error_code) { 3651 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3652 vmcs12->idt_vectoring_error_code = 3653 vcpu->arch.exception.error_code; 3654 } 3655 3656 vmcs12->idt_vectoring_info_field = idt_vectoring; 3657 } else if (vcpu->arch.nmi_injected) { 3658 vmcs12->idt_vectoring_info_field = 3659 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3660 } else if (vcpu->arch.interrupt.injected) { 3661 nr = vcpu->arch.interrupt.nr; 3662 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3663 3664 if (vcpu->arch.interrupt.soft) { 3665 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3666 vmcs12->vm_entry_instruction_len = 3667 vcpu->arch.event_exit_inst_len; 3668 } else 3669 idt_vectoring |= INTR_TYPE_EXT_INTR; 3670 3671 vmcs12->idt_vectoring_info_field = idt_vectoring; 3672 } 3673 } 3674 3675 3676 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3677 { 3678 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3679 gfn_t gfn; 3680 3681 /* 3682 * Don't need to mark the APIC access page dirty; it is never 3683 * written to by the CPU during APIC virtualization. 
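	 *
	 * The virtual-APIC page and the posted-interrupt descriptor, however,
	 * can be written by the CPU while L2 runs, so mark them dirty below
	 * when the corresponding vmcs12 controls are enabled.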
3684 */ 3685 3686 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3687 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3688 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3689 } 3690 3691 if (nested_cpu_has_posted_intr(vmcs12)) { 3692 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3693 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3694 } 3695 } 3696 3697 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3698 { 3699 struct vcpu_vmx *vmx = to_vmx(vcpu); 3700 int max_irr; 3701 void *vapic_page; 3702 u16 status; 3703 3704 if (!vmx->nested.pi_pending) 3705 return 0; 3706 3707 if (!vmx->nested.pi_desc) 3708 goto mmio_needed; 3709 3710 vmx->nested.pi_pending = false; 3711 3712 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3713 return 0; 3714 3715 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3716 if (max_irr != 256) { 3717 vapic_page = vmx->nested.virtual_apic_map.hva; 3718 if (!vapic_page) 3719 goto mmio_needed; 3720 3721 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3722 vapic_page, &max_irr); 3723 status = vmcs_read16(GUEST_INTR_STATUS); 3724 if ((u8)max_irr > ((u8)status & 0xff)) { 3725 status &= ~0xff; 3726 status |= (u8)max_irr; 3727 vmcs_write16(GUEST_INTR_STATUS, status); 3728 } 3729 } 3730 3731 nested_mark_vmcs12_pages_dirty(vcpu); 3732 return 0; 3733 3734 mmio_needed: 3735 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3736 return -ENXIO; 3737 } 3738 3739 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3740 unsigned long exit_qual) 3741 { 3742 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3743 unsigned int nr = vcpu->arch.exception.nr; 3744 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3745 3746 if (vcpu->arch.exception.has_error_code) { 3747 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3748 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3749 } 3750 3751 if (kvm_exception_is_soft(nr)) 3752 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3753 else 3754 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3755 3756 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3757 vmx_get_nmi_mask(vcpu)) 3758 intr_info |= INTR_INFO_UNBLOCK_NMI; 3759 3760 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3761 } 3762 3763 /* 3764 * Returns true if a debug trap is pending delivery. 3765 * 3766 * In KVM, debug traps bear an exception payload. As such, the class of a #DB 3767 * exception may be inferred from the presence of an exception payload. 3768 */ 3769 static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) 3770 { 3771 return vcpu->arch.exception.pending && 3772 vcpu->arch.exception.nr == DB_VECTOR && 3773 vcpu->arch.exception.payload; 3774 } 3775 3776 /* 3777 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 3778 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 3779 * represents these debug traps with a payload that is said to be compatible 3780 * with the 'pending debug exceptions' field, write the payload to the VMCS 3781 * field if a VM-exit is delivered before the debug trap. 
3782 */ 3783 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 3784 { 3785 if (vmx_pending_dbg_trap(vcpu)) 3786 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 3787 vcpu->arch.exception.payload); 3788 } 3789 3790 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 3791 { 3792 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3793 to_vmx(vcpu)->nested.preemption_timer_expired; 3794 } 3795 3796 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 3797 { 3798 struct vcpu_vmx *vmx = to_vmx(vcpu); 3799 unsigned long exit_qual; 3800 bool block_nested_events = 3801 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3802 bool mtf_pending = vmx->nested.mtf_pending; 3803 struct kvm_lapic *apic = vcpu->arch.apic; 3804 3805 /* 3806 * Clear the MTF state. If a higher priority VM-exit is delivered first, 3807 * this state is discarded. 3808 */ 3809 if (!block_nested_events) 3810 vmx->nested.mtf_pending = false; 3811 3812 if (lapic_in_kernel(vcpu) && 3813 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3814 if (block_nested_events) 3815 return -EBUSY; 3816 nested_vmx_update_pending_dbg(vcpu); 3817 clear_bit(KVM_APIC_INIT, &apic->pending_events); 3818 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 3819 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3820 return 0; 3821 } 3822 3823 if (lapic_in_kernel(vcpu) && 3824 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 3825 if (block_nested_events) 3826 return -EBUSY; 3827 3828 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 3829 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 3830 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 3831 apic->sipi_vector & 0xFFUL); 3832 return 0; 3833 } 3834 3835 /* 3836 * Process any exceptions that are not debug traps before MTF. 3837 * 3838 * Note that only a pending nested run can block a pending exception. 3839 * Otherwise an injected NMI/interrupt should either be 3840 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO, 3841 * while delivering the pending exception. 
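	 *
	 * The remaining checks below therefore run in priority order:
	 * exceptions other than debug traps, then MTF, then debug traps, then
	 * the VMX-preemption timer, SMI, NMI and finally external interrupts.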
3842 */ 3843 3844 if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { 3845 if (vmx->nested.nested_run_pending) 3846 return -EBUSY; 3847 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3848 goto no_vmexit; 3849 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3850 return 0; 3851 } 3852 3853 if (mtf_pending) { 3854 if (block_nested_events) 3855 return -EBUSY; 3856 nested_vmx_update_pending_dbg(vcpu); 3857 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 3858 return 0; 3859 } 3860 3861 if (vcpu->arch.exception.pending) { 3862 if (vmx->nested.nested_run_pending) 3863 return -EBUSY; 3864 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3865 goto no_vmexit; 3866 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3867 return 0; 3868 } 3869 3870 if (nested_vmx_preemption_timer_pending(vcpu)) { 3871 if (block_nested_events) 3872 return -EBUSY; 3873 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3874 return 0; 3875 } 3876 3877 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 3878 if (block_nested_events) 3879 return -EBUSY; 3880 goto no_vmexit; 3881 } 3882 3883 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 3884 if (block_nested_events) 3885 return -EBUSY; 3886 if (!nested_exit_on_nmi(vcpu)) 3887 goto no_vmexit; 3888 3889 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3890 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3891 INTR_INFO_VALID_MASK, 0); 3892 /* 3893 * The NMI-triggered VM exit counts as injection: 3894 * clear this one and block further NMIs. 3895 */ 3896 vcpu->arch.nmi_pending = 0; 3897 vmx_set_nmi_mask(vcpu, true); 3898 return 0; 3899 } 3900 3901 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 3902 if (block_nested_events) 3903 return -EBUSY; 3904 if (!nested_exit_on_intr(vcpu)) 3905 goto no_vmexit; 3906 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3907 return 0; 3908 } 3909 3910 no_vmexit: 3911 return vmx_complete_nested_posted_interrupt(vcpu); 3912 } 3913 3914 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3915 { 3916 ktime_t remaining = 3917 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 3918 u64 value; 3919 3920 if (ktime_to_ns(remaining) <= 0) 3921 return 0; 3922 3923 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 3924 do_div(value, 1000000); 3925 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 3926 } 3927 3928 static bool is_vmcs12_ext_field(unsigned long field) 3929 { 3930 switch (field) { 3931 case GUEST_ES_SELECTOR: 3932 case GUEST_CS_SELECTOR: 3933 case GUEST_SS_SELECTOR: 3934 case GUEST_DS_SELECTOR: 3935 case GUEST_FS_SELECTOR: 3936 case GUEST_GS_SELECTOR: 3937 case GUEST_LDTR_SELECTOR: 3938 case GUEST_TR_SELECTOR: 3939 case GUEST_ES_LIMIT: 3940 case GUEST_CS_LIMIT: 3941 case GUEST_SS_LIMIT: 3942 case GUEST_DS_LIMIT: 3943 case GUEST_FS_LIMIT: 3944 case GUEST_GS_LIMIT: 3945 case GUEST_LDTR_LIMIT: 3946 case GUEST_TR_LIMIT: 3947 case GUEST_GDTR_LIMIT: 3948 case GUEST_IDTR_LIMIT: 3949 case GUEST_ES_AR_BYTES: 3950 case GUEST_DS_AR_BYTES: 3951 case GUEST_FS_AR_BYTES: 3952 case GUEST_GS_AR_BYTES: 3953 case GUEST_LDTR_AR_BYTES: 3954 case GUEST_TR_AR_BYTES: 3955 case GUEST_ES_BASE: 3956 case GUEST_CS_BASE: 3957 case GUEST_SS_BASE: 3958 case GUEST_DS_BASE: 3959 case GUEST_FS_BASE: 3960 case GUEST_GS_BASE: 3961 case GUEST_LDTR_BASE: 3962 case GUEST_TR_BASE: 3963 case GUEST_GDTR_BASE: 3964 case GUEST_IDTR_BASE: 3965 case GUEST_PENDING_DBG_EXCEPTIONS: 3966 case GUEST_BNDCFGS: 3967 return true; 3968 default: 3969 break; 3970 } 3971 3972 return 
false; 3973 } 3974 3975 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3976 struct vmcs12 *vmcs12) 3977 { 3978 struct vcpu_vmx *vmx = to_vmx(vcpu); 3979 3980 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 3981 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 3982 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 3983 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 3984 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 3985 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 3986 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 3987 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 3988 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 3989 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 3990 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 3991 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 3992 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 3993 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 3994 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 3995 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 3996 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 3997 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 3998 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 3999 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4000 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4001 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4002 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4003 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4004 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4005 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4006 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4007 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4008 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4009 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4010 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4011 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4012 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4013 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4014 vmcs12->guest_pending_dbg_exceptions = 4015 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4016 if (kvm_mpx_supported()) 4017 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 4018 4019 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4020 } 4021 4022 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4023 struct vmcs12 *vmcs12) 4024 { 4025 struct vcpu_vmx *vmx = to_vmx(vcpu); 4026 int cpu; 4027 4028 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4029 return; 4030 4031 4032 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4033 4034 cpu = get_cpu(); 4035 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4036 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4037 4038 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4039 4040 vmx->loaded_vmcs = &vmx->vmcs01; 4041 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4042 put_cpu(); 4043 } 4044 4045 /* 4046 * Update the guest state fields of vmcs12 to reflect changes that 4047 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4048 * VM-entry controls is also updated, since this is really a guest 4049 * state bit.) 
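 *
 * Rarely-read fields (segment state and the like) are handled by
 * sync_vmcs02_to_vmcs12_rare(); when no Enlightened VMCS is in use, their
 * sync is deferred and performed on demand by copy_vmcs02_to_vmcs12_rare().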
4050 */ 4051 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4052 { 4053 struct vcpu_vmx *vmx = to_vmx(vcpu); 4054 4055 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 4056 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4057 4058 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4059 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); 4060 4061 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4062 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4063 4064 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4065 vmcs12->guest_rip = kvm_rip_read(vcpu); 4066 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4067 4068 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4069 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4070 4071 vmcs12->guest_interruptibility_info = 4072 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4073 4074 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4075 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4076 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4077 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4078 else 4079 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4080 4081 if (nested_cpu_has_preemption_timer(vmcs12) && 4082 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4083 !vmx->nested.nested_run_pending) 4084 vmcs12->vmx_preemption_timer_value = 4085 vmx_get_preemption_timer_value(vcpu); 4086 4087 /* 4088 * In some cases (usually, nested EPT), L2 is allowed to change its 4089 * own CR3 without exiting. If it has changed it, we must keep it. 4090 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4091 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4092 * 4093 * Additionally, restore L2's PDPTR to vmcs12. 4094 */ 4095 if (enable_ept) { 4096 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4097 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4098 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4099 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4100 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4101 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4102 } 4103 } 4104 4105 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4106 4107 if (nested_cpu_has_vid(vmcs12)) 4108 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4109 4110 vmcs12->vm_entry_controls = 4111 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4112 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4113 4114 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4115 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 4116 4117 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4118 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4119 } 4120 4121 /* 4122 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4123 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4124 * and this function updates it to reflect the changes to the guest state while 4125 * L2 was running (and perhaps made some exits which were handled directly by L0 4126 * without going back to L1), and to reflect the exit reason. 4127 * Note that we do not have to copy here all VMCS fields, just those that 4128 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4129 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4130 * which already writes to vmcs12 directly. 
4131	 */
4132	static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
4133			   u32 vm_exit_reason, u32 exit_intr_info,
4134			   unsigned long exit_qualification)
4135	{
4136		/* update exit information fields: */
4137		vmcs12->vm_exit_reason = vm_exit_reason;
4138		if (to_vmx(vcpu)->exit_reason.enclave_mode)
4139			vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
4140		vmcs12->exit_qualification = exit_qualification;
4141		vmcs12->vm_exit_intr_info = exit_intr_info;
4142
4143		vmcs12->idt_vectoring_info_field = 0;
4144		vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
4145		vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4146
4147		if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
4148			vmcs12->launch_state = 1;
4149
4150			/* vm_entry_intr_info_field is cleared on exit. Emulate this
4151			 * instead of reading the real value. */
4152			vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
4153
4154			/*
4155			 * Transfer the event that L0 or L1 may have wanted to inject
4156			 * into L2 to IDT_VECTORING_INFO_FIELD.
4157			 */
4158			vmcs12_save_pending_event(vcpu, vmcs12);
4159
4160			/*
4161			 * According to spec, there's no need to store the guest's
4162			 * MSRs if the exit is due to a VM-entry failure that occurs
4163			 * during or after loading the guest state. Since this exit
4164			 * does not fall in that category, we need to save the MSRs.
4165			 */
4166			if (nested_vmx_store_msr(vcpu,
4167						 vmcs12->vm_exit_msr_store_addr,
4168						 vmcs12->vm_exit_msr_store_count))
4169				nested_vmx_abort(vcpu,
4170						 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
4171		}
4172
4173		/*
4174		 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
4175		 * preserved above and would only end up incorrectly in L1.
4176		 */
4177		vcpu->arch.nmi_injected = false;
4178		kvm_clear_exception_queue(vcpu);
4179		kvm_clear_interrupt_queue(vcpu);
4180	}
4181
4182	/*
4183	 * A part of what we need to do when the nested L2 guest exits and we want
4184	 * to run its L1 parent is to reset L1's guest state to the host state
4185	 * specified in vmcs12.
4186	 * This function is to be called not only on normal nested exit, but also on
4187	 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
4188	 * Failures During or After Loading Guest State").
4189	 * This function should be called when the active VMCS is L1's (vmcs01).
4190	 */
4191	static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
4192				   struct vmcs12 *vmcs12)
4193	{
4194		enum vm_entry_failure_code ignored;
4195		struct kvm_segment seg;
4196
4197		if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
4198			vcpu->arch.efer = vmcs12->host_ia32_efer;
4199		else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4200			vcpu->arch.efer |= (EFER_LMA | EFER_LME);
4201		else
4202			vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
4203		vmx_set_efer(vcpu, vcpu->arch.efer);
4204
4205		kvm_rsp_write(vcpu, vmcs12->host_rsp);
4206		kvm_rip_write(vcpu, vmcs12->host_rip);
4207		vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
4208		vmx_set_interrupt_shadow(vcpu, 0);
4209
4210		/*
4211		 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
4212		 * actually changed, because vmx_set_cr0 refers to efer set above.
4213		 *
4214		 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
4215		 * (KVM doesn't change it).
4216		 */
4217		vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
4218		vmx_set_cr0(vcpu, vmcs12->host_cr0);
4219
4220		/* Same as above - no reason to call set_cr4_guest_host_mask().
*/ 4221 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4222 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4223 4224 nested_ept_uninit_mmu_context(vcpu); 4225 4226 /* 4227 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4228 * couldn't have changed. 4229 */ 4230 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4231 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4232 4233 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4234 4235 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4236 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4237 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4238 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4239 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4240 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4241 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4242 4243 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4244 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4245 vmcs_write64(GUEST_BNDCFGS, 0); 4246 4247 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4248 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4249 vcpu->arch.pat = vmcs12->host_ia32_pat; 4250 } 4251 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 4252 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4253 vmcs12->host_ia32_perf_global_ctrl)); 4254 4255 /* Set L1 segment info according to Intel SDM 4256 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4257 seg = (struct kvm_segment) { 4258 .base = 0, 4259 .limit = 0xFFFFFFFF, 4260 .selector = vmcs12->host_cs_selector, 4261 .type = 11, 4262 .present = 1, 4263 .s = 1, 4264 .g = 1 4265 }; 4266 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4267 seg.l = 1; 4268 else 4269 seg.db = 1; 4270 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4271 seg = (struct kvm_segment) { 4272 .base = 0, 4273 .limit = 0xFFFFFFFF, 4274 .type = 3, 4275 .present = 1, 4276 .s = 1, 4277 .db = 1, 4278 .g = 1 4279 }; 4280 seg.selector = vmcs12->host_ds_selector; 4281 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4282 seg.selector = vmcs12->host_es_selector; 4283 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4284 seg.selector = vmcs12->host_ss_selector; 4285 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4286 seg.selector = vmcs12->host_fs_selector; 4287 seg.base = vmcs12->host_fs_base; 4288 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4289 seg.selector = vmcs12->host_gs_selector; 4290 seg.base = vmcs12->host_gs_base; 4291 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4292 seg = (struct kvm_segment) { 4293 .base = vmcs12->host_tr_base, 4294 .limit = 0x67, 4295 .selector = vmcs12->host_tr_selector, 4296 .type = 11, 4297 .present = 1 4298 }; 4299 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4300 4301 kvm_set_dr(vcpu, 7, 0x400); 4302 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4303 4304 if (cpu_has_vmx_msr_bitmap()) 4305 vmx_update_msr_bitmap(vcpu); 4306 4307 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4308 vmcs12->vm_exit_msr_load_count)) 4309 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4310 } 4311 4312 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4313 { 4314 struct vmx_uret_msr *efer_msr; 4315 unsigned int i; 4316 4317 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4318 return vmcs_read64(GUEST_IA32_EFER); 4319 4320 if (cpu_has_load_ia32_efer()) 4321 return host_efer; 4322 4323 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4324 
if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4325 return vmx->msr_autoload.guest.val[i].value; 4326 } 4327 4328 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4329 if (efer_msr) 4330 return efer_msr->data; 4331 4332 return host_efer; 4333 } 4334 4335 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4336 { 4337 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4338 struct vcpu_vmx *vmx = to_vmx(vcpu); 4339 struct vmx_msr_entry g, h; 4340 gpa_t gpa; 4341 u32 i, j; 4342 4343 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4344 4345 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4346 /* 4347 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4348 * as vmcs01.GUEST_DR7 contains a userspace defined value 4349 * and vcpu->arch.dr7 is not squirreled away before the 4350 * nested VMENTER (not worth adding a variable in nested_vmx). 4351 */ 4352 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4353 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4354 else 4355 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4356 } 4357 4358 /* 4359 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4360 * handle a variety of side effects to KVM's software model. 4361 */ 4362 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4363 4364 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4365 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4366 4367 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4368 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4369 4370 nested_ept_uninit_mmu_context(vcpu); 4371 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4372 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4373 4374 /* 4375 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4376 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4377 * VMFail, like everything else we just need to ensure our 4378 * software model is up-to-date. 4379 */ 4380 if (enable_ept && is_pae_paging(vcpu)) 4381 ept_save_pdptrs(vcpu); 4382 4383 kvm_mmu_reset_context(vcpu); 4384 4385 if (cpu_has_vmx_msr_bitmap()) 4386 vmx_update_msr_bitmap(vcpu); 4387 4388 /* 4389 * This nasty bit of open coding is a compromise between blindly 4390 * loading L1's MSRs using the exit load lists (incorrect emulation 4391 * of VMFail), leaving the nested VM's MSRs in the software model 4392 * (incorrect behavior) and snapshotting the modified MSRs (too 4393 * expensive since the lists are unbound by hardware). For each 4394 * MSR that was (prematurely) loaded from the nested VMEntry load 4395 * list, reload it from the exit load list if it exists and differs 4396 * from the guest value. The intent is to stuff host state as 4397 * silently as possible, not to fully process the exit load list. 
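	 *
	 * Concretely: for each MSR index in vmcs12's VM-entry MSR-load list,
	 * look up the same index in the VM-exit MSR-load list and, if the
	 * values differ, restore the exit-list value via kvm_set_msr().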
4398 */ 4399 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4400 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4401 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4402 pr_debug_ratelimited( 4403 "%s read MSR index failed (%u, 0x%08llx)\n", 4404 __func__, i, gpa); 4405 goto vmabort; 4406 } 4407 4408 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4409 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4410 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4411 pr_debug_ratelimited( 4412 "%s read MSR failed (%u, 0x%08llx)\n", 4413 __func__, j, gpa); 4414 goto vmabort; 4415 } 4416 if (h.index != g.index) 4417 continue; 4418 if (h.value == g.value) 4419 break; 4420 4421 if (nested_vmx_load_msr_check(vcpu, &h)) { 4422 pr_debug_ratelimited( 4423 "%s check failed (%u, 0x%x, 0x%x)\n", 4424 __func__, j, h.index, h.reserved); 4425 goto vmabort; 4426 } 4427 4428 if (kvm_set_msr(vcpu, h.index, h.value)) { 4429 pr_debug_ratelimited( 4430 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4431 __func__, j, h.index, h.value); 4432 goto vmabort; 4433 } 4434 } 4435 } 4436 4437 return; 4438 4439 vmabort: 4440 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4441 } 4442 4443 /* 4444 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4445 * and modify vmcs12 to make it see what it would expect to see there if 4446 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4447 */ 4448 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4449 u32 exit_intr_info, unsigned long exit_qualification) 4450 { 4451 struct vcpu_vmx *vmx = to_vmx(vcpu); 4452 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4453 4454 /* trying to cancel vmlaunch/vmresume is a bug */ 4455 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4456 4457 /* Similarly, triple faults in L2 should never escape. */ 4458 WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); 4459 4460 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4461 /* 4462 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4463 * Enlightened VMCS after migration and we still need to 4464 * do that when something is forcing L2->L1 exit prior to 4465 * the first L2 run. 4466 */ 4467 (void)nested_get_evmcs_page(vcpu); 4468 } 4469 4470 /* Service the TLB flush request for L2 before switching to L1. */ 4471 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) 4472 kvm_vcpu_flush_tlb_current(vcpu); 4473 4474 /* 4475 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4476 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4477 * up-to-date before switching to L1. 4478 */ 4479 if (enable_ept && is_pae_paging(vcpu)) 4480 vmx_ept_load_pdptrs(vcpu); 4481 4482 leave_guest_mode(vcpu); 4483 4484 if (nested_cpu_has_preemption_timer(vmcs12)) 4485 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4486 4487 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4488 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4489 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4490 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4491 } 4492 4493 if (likely(!vmx->fail)) { 4494 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4495 4496 if (vm_exit_reason != -1) 4497 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4498 exit_intr_info, exit_qualification); 4499 4500 /* 4501 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4502 * also be used to capture vmcs12 cache as part of 4503 * capturing nVMX state for snapshot (migration). 
4504 * 4505 * Otherwise, this flush will dirty guest memory at a 4506 * point it is already assumed by user-space to be 4507 * immutable. 4508 */ 4509 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4510 } else { 4511 /* 4512 * The only expected VM-instruction error is "VM entry with 4513 * invalid control field(s)." Anything else indicates a 4514 * problem with L0. And we should never get here with a 4515 * VMFail of any type if early consistency checks are enabled. 4516 */ 4517 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4518 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4519 WARN_ON_ONCE(nested_early_check); 4520 } 4521 4522 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4523 4524 /* Update any VMCS fields that might have changed while L2 ran */ 4525 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4526 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4527 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4528 if (kvm_has_tsc_control) 4529 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 4530 4531 if (vmx->nested.l1_tpr_threshold != -1) 4532 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4533 4534 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4535 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4536 vmx_set_virtual_apic_mode(vcpu); 4537 } 4538 4539 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4540 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4541 vmx_update_cpu_dirty_logging(vcpu); 4542 } 4543 4544 /* Unpin physical memory we referred to in vmcs02 */ 4545 if (vmx->nested.apic_access_page) { 4546 kvm_release_page_clean(vmx->nested.apic_access_page); 4547 vmx->nested.apic_access_page = NULL; 4548 } 4549 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4550 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4551 vmx->nested.pi_desc = NULL; 4552 4553 if (vmx->nested.reload_vmcs01_apic_access_page) { 4554 vmx->nested.reload_vmcs01_apic_access_page = false; 4555 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4556 } 4557 4558 if ((vm_exit_reason != -1) && 4559 (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) 4560 vmx->nested.need_vmcs12_to_shadow_sync = true; 4561 4562 /* in case we halted in L2 */ 4563 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4564 4565 if (likely(!vmx->fail)) { 4566 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4567 nested_exit_intr_ack_set(vcpu)) { 4568 int irq = kvm_cpu_get_interrupt(vcpu); 4569 WARN_ON(irq < 0); 4570 vmcs12->vm_exit_intr_info = irq | 4571 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4572 } 4573 4574 if (vm_exit_reason != -1) 4575 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4576 vmcs12->exit_qualification, 4577 vmcs12->idt_vectoring_info_field, 4578 vmcs12->vm_exit_intr_info, 4579 vmcs12->vm_exit_intr_error_code, 4580 KVM_ISA_VMX); 4581 4582 load_vmcs12_host_state(vcpu, vmcs12); 4583 4584 return; 4585 } 4586 4587 /* 4588 * After an early L2 VM-entry failure, we're now back 4589 * in L1 which thinks it just finished a VMLAUNCH or 4590 * VMRESUME instruction, so we need to set the failure 4591 * flag and the VM-instruction error field of the VMCS 4592 * accordingly, and skip the emulated instruction. 4593 */ 4594 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4595 4596 /* 4597 * Restore L1's host state to KVM's software model. 
We're here 4598 * because a consistency check was caught by hardware, which 4599 * means some amount of guest state has been propagated to KVM's 4600 * model and needs to be unwound to the host's state. 4601 */ 4602 nested_vmx_restore_host_state(vcpu); 4603 4604 vmx->fail = 0; 4605 } 4606 4607 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4608 { 4609 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4610 } 4611 4612 /* 4613 * Decode the memory-address operand of a vmx instruction, as recorded on an 4614 * exit caused by such an instruction (run by a guest hypervisor). 4615 * On success, returns 0. When the operand is invalid, returns 1 and throws 4616 * #UD, #GP, or #SS. 4617 */ 4618 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4619 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4620 { 4621 gva_t off; 4622 bool exn; 4623 struct kvm_segment s; 4624 4625 /* 4626 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4627 * Execution", on an exit, vmx_instruction_info holds most of the 4628 * addressing components of the operand. Only the displacement part 4629 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4630 * For how an actual address is calculated from all these components, 4631 * refer to Vol. 1, "Operand Addressing". 4632 */ 4633 int scaling = vmx_instruction_info & 3; 4634 int addr_size = (vmx_instruction_info >> 7) & 7; 4635 bool is_reg = vmx_instruction_info & (1u << 10); 4636 int seg_reg = (vmx_instruction_info >> 15) & 7; 4637 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4638 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4639 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4640 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4641 4642 if (is_reg) { 4643 kvm_queue_exception(vcpu, UD_VECTOR); 4644 return 1; 4645 } 4646 4647 /* Addr = segment_base + offset */ 4648 /* offset = base + [index * scale] + displacement */ 4649 off = exit_qualification; /* holds the displacement */ 4650 if (addr_size == 1) 4651 off = (gva_t)sign_extend64(off, 31); 4652 else if (addr_size == 0) 4653 off = (gva_t)sign_extend64(off, 15); 4654 if (base_is_valid) 4655 off += kvm_register_read(vcpu, base_reg); 4656 if (index_is_valid) 4657 off += kvm_register_read(vcpu, index_reg) << scaling; 4658 vmx_get_segment(vcpu, &s, seg_reg); 4659 4660 /* 4661 * The effective address, i.e. @off, of a memory operand is truncated 4662 * based on the address size of the instruction. Note that this is 4663 * the *effective address*, i.e. the address prior to accounting for 4664 * the segment's base. 4665 */ 4666 if (addr_size == 1) /* 32 bit */ 4667 off &= 0xffffffff; 4668 else if (addr_size == 0) /* 16 bit */ 4669 off &= 0xffff; 4670 4671 /* Checks for #GP/#SS exceptions. */ 4672 exn = false; 4673 if (is_long_mode(vcpu)) { 4674 /* 4675 * The virtual/linear address is never truncated in 64-bit 4676 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4677 * address when using FS/GS with a non-zero base. 4678 */ 4679 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4680 *ret = s.base + off; 4681 else 4682 *ret = off; 4683 4684 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4685 * non-canonical form. This is the only check on the memory 4686 * destination for long mode! 
4687 */ 4688 exn = is_noncanonical_address(*ret, vcpu); 4689 } else { 4690 /* 4691 * When not in long mode, the virtual/linear address is 4692 * unconditionally truncated to 32 bits regardless of the 4693 * address size. 4694 */ 4695 *ret = (s.base + off) & 0xffffffff; 4696 4697 /* Protected mode: apply checks for segment validity in the 4698 * following order: 4699 * - segment type check (#GP(0) may be thrown) 4700 * - usability check (#GP(0)/#SS(0)) 4701 * - limit check (#GP(0)/#SS(0)) 4702 */ 4703 if (wr) 4704 /* #GP(0) if the destination operand is located in a 4705 * read-only data segment or any code segment. 4706 */ 4707 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4708 else 4709 /* #GP(0) if the source operand is located in an 4710 * execute-only code segment 4711 */ 4712 exn = ((s.type & 0xa) == 8); 4713 if (exn) { 4714 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4715 return 1; 4716 } 4717 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4718 */ 4719 exn = (s.unusable != 0); 4720 4721 /* 4722 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4723 * outside the segment limit. All CPUs that support VMX ignore 4724 * limit checks for flat segments, i.e. segments with base==0, 4725 * limit==0xffffffff and of type expand-up data or code. 4726 */ 4727 if (!(s.base == 0 && s.limit == 0xffffffff && 4728 ((s.type & 8) || !(s.type & 4)))) 4729 exn = exn || ((u64)off + len - 1 > s.limit); 4730 } 4731 if (exn) { 4732 kvm_queue_exception_e(vcpu, 4733 seg_reg == VCPU_SREG_SS ? 4734 SS_VECTOR : GP_VECTOR, 4735 0); 4736 return 1; 4737 } 4738 4739 return 0; 4740 } 4741 4742 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) 4743 { 4744 struct vcpu_vmx *vmx; 4745 4746 if (!nested_vmx_allowed(vcpu)) 4747 return; 4748 4749 vmx = to_vmx(vcpu); 4750 if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { 4751 vmx->nested.msrs.entry_ctls_high |= 4752 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4753 vmx->nested.msrs.exit_ctls_high |= 4754 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4755 } else { 4756 vmx->nested.msrs.entry_ctls_high &= 4757 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4758 vmx->nested.msrs.exit_ctls_high &= 4759 ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4760 } 4761 } 4762 4763 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 4764 int *ret) 4765 { 4766 gva_t gva; 4767 struct x86_exception e; 4768 int r; 4769 4770 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 4771 vmcs_read32(VMX_INSTRUCTION_INFO), false, 4772 sizeof(*vmpointer), &gva)) { 4773 *ret = 1; 4774 return -EINVAL; 4775 } 4776 4777 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 4778 if (r != X86EMUL_CONTINUE) { 4779 *ret = kvm_handle_memory_failure(vcpu, r, &e); 4780 return -EINVAL; 4781 } 4782 4783 return 0; 4784 } 4785 4786 /* 4787 * Allocate a shadow VMCS and associate it with the currently loaded 4788 * VMCS, unless such a shadow VMCS already exists. The newly allocated 4789 * VMCS is also VMCLEARed, so that it is ready for use. 4790 */ 4791 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 4792 { 4793 struct vcpu_vmx *vmx = to_vmx(vcpu); 4794 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 4795 4796 /* 4797 * We should allocate a shadow vmcs for vmcs01 only when L1 4798 * executes VMXON and free it when L1 executes VMXOFF. 4799 * As it is invalid to execute VMXON twice, we shouldn't reach 4800 * here when vmcs01 already have an allocated shadow vmcs. 
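	 *
	 * For vmcs02, on the other hand, a previously allocated shadow VMCS
	 * may still be attached, in which case it is simply reused.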
4801	 */
4802	WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4803
4804	if (!loaded_vmcs->shadow_vmcs) {
4805		loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4806		if (loaded_vmcs->shadow_vmcs)
4807			vmcs_clear(loaded_vmcs->shadow_vmcs);
4808	}
4809	return loaded_vmcs->shadow_vmcs;
4810	}
4811
4812	static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4813	{
4814		struct vcpu_vmx *vmx = to_vmx(vcpu);
4815		int r;
4816
4817		r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4818		if (r < 0)
4819			goto out_vmcs02;
4820
4821		vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4822		if (!vmx->nested.cached_vmcs12)
4823			goto out_cached_vmcs12;
4824
4825		vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4826		if (!vmx->nested.cached_shadow_vmcs12)
4827			goto out_cached_shadow_vmcs12;
4828
4829		if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4830			goto out_shadow_vmcs;
4831
4832		hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4833			     HRTIMER_MODE_ABS_PINNED);
4834		vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4835
4836		vmx->nested.vpid02 = allocate_vpid();
4837
4838		vmx->nested.vmcs02_initialized = false;
4839		vmx->nested.vmxon = true;
4840
4841		if (vmx_pt_mode_is_host_guest()) {
4842			vmx->pt_desc.guest.ctl = 0;
4843			pt_update_intercept_for_msr(vcpu);
4844		}
4845
4846		return 0;
4847
4848	out_shadow_vmcs:
4849		kfree(vmx->nested.cached_shadow_vmcs12);
4850
4851	out_cached_shadow_vmcs12:
4852		kfree(vmx->nested.cached_vmcs12);
4853
4854	out_cached_vmcs12:
4855		free_loaded_vmcs(&vmx->nested.vmcs02);
4856
4857	out_vmcs02:
4858		return -ENOMEM;
4859	}
4860
4861	/*
4862	 * Emulate the VMXON instruction.
4863	 * Currently, we just remember that VMX is active, and do not save or even
4864	 * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4865	 * do not currently need to store anything in that guest-allocated memory
4866	 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
4867	 * argument is different from the VMXON pointer (which the spec says they do).
4868	 */
4869	static int handle_vmon(struct kvm_vcpu *vcpu)
4870	{
4871		int ret;
4872		gpa_t vmptr;
4873		uint32_t revision;
4874		struct vcpu_vmx *vmx = to_vmx(vcpu);
4875		const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
4876			| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
4877
4878		/*
4879		 * The Intel VMX Instruction Reference lists a bunch of bits that are
4880		 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4881		 * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this).
4882		 * Otherwise, we should fail with #UD. But most faulting conditions
4883		 * have already been checked by hardware, prior to the VM-exit for
4884		 * VMXON. We do test guest cr4.VMXE because processor CR4 always has
4885		 * that bit set to 1 in non-root mode.
4886		 */
4887		if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4888			kvm_queue_exception(vcpu, UD_VECTOR);
4889			return 1;
4890		}
4891
4892		/* CPL=0 must be checked manually.
*/ 4893 if (vmx_get_cpl(vcpu)) { 4894 kvm_inject_gp(vcpu, 0); 4895 return 1; 4896 } 4897 4898 if (vmx->nested.vmxon) 4899 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 4900 4901 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 4902 != VMXON_NEEDED_FEATURES) { 4903 kvm_inject_gp(vcpu, 0); 4904 return 1; 4905 } 4906 4907 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 4908 return ret; 4909 4910 /* 4911 * SDM 3: 24.11.5 4912 * The first 4 bytes of VMXON region contain the supported 4913 * VMCS revision identifier 4914 * 4915 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 4916 * which replaces physical address width with 32 4917 */ 4918 if (!page_address_valid(vcpu, vmptr)) 4919 return nested_vmx_failInvalid(vcpu); 4920 4921 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 4922 revision != VMCS12_REVISION) 4923 return nested_vmx_failInvalid(vcpu); 4924 4925 vmx->nested.vmxon_ptr = vmptr; 4926 ret = enter_vmx_operation(vcpu); 4927 if (ret) 4928 return ret; 4929 4930 return nested_vmx_succeed(vcpu); 4931 } 4932 4933 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 4934 { 4935 struct vcpu_vmx *vmx = to_vmx(vcpu); 4936 4937 if (vmx->nested.current_vmptr == -1ull) 4938 return; 4939 4940 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 4941 4942 if (enable_shadow_vmcs) { 4943 /* copy to memory all shadowed fields in case 4944 they were modified */ 4945 copy_shadow_to_vmcs12(vmx); 4946 vmx_disable_shadow_vmcs(vmx); 4947 } 4948 vmx->nested.posted_intr_nv = -1; 4949 4950 /* Flush VMCS12 to guest memory */ 4951 kvm_vcpu_write_guest_page(vcpu, 4952 vmx->nested.current_vmptr >> PAGE_SHIFT, 4953 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 4954 4955 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 4956 4957 vmx->nested.current_vmptr = -1ull; 4958 } 4959 4960 /* Emulate the VMXOFF instruction */ 4961 static int handle_vmoff(struct kvm_vcpu *vcpu) 4962 { 4963 if (!nested_vmx_check_permission(vcpu)) 4964 return 1; 4965 4966 free_nested(vcpu); 4967 4968 /* Process a latched INIT during time CPU was in VMX operation */ 4969 kvm_make_request(KVM_REQ_EVENT, vcpu); 4970 4971 return nested_vmx_succeed(vcpu); 4972 } 4973 4974 /* Emulate the VMCLEAR instruction */ 4975 static int handle_vmclear(struct kvm_vcpu *vcpu) 4976 { 4977 struct vcpu_vmx *vmx = to_vmx(vcpu); 4978 u32 zero = 0; 4979 gpa_t vmptr; 4980 u64 evmcs_gpa; 4981 int r; 4982 4983 if (!nested_vmx_check_permission(vcpu)) 4984 return 1; 4985 4986 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 4987 return r; 4988 4989 if (!page_address_valid(vcpu, vmptr)) 4990 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 4991 4992 if (vmptr == vmx->nested.vmxon_ptr) 4993 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 4994 4995 /* 4996 * When Enlightened VMEntry is enabled on the calling CPU we treat 4997 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 4998 * way to distinguish it from VMCS12) and we must not corrupt it by 4999 * writing to the non-existent 'launch_state' field. The area doesn't 5000 * have to be the currently active EVMCS on the calling CPU and there's 5001 * nothing KVM has to do to transition it from 'active' to 'non-active' 5002 * state. It is possible that the area will stay mapped as 5003 * vmx->nested.hv_evmcs but this shouldn't be a problem. 
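	 *
	 * Hence the logic below: a regular VMCS pointer has its launch_state
	 * cleared in guest memory (after releasing it if it is the current
	 * VMCS), whereas a pointer matching the currently mapped Enlightened
	 * VMCS is handled by nested_release_evmcs() instead.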
5004 */ 5005 if (likely(!vmx->nested.enlightened_vmcs_enabled || 5006 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 5007 if (vmptr == vmx->nested.current_vmptr) 5008 nested_release_vmcs12(vcpu); 5009 5010 kvm_vcpu_write_guest(vcpu, 5011 vmptr + offsetof(struct vmcs12, 5012 launch_state), 5013 &zero, sizeof(zero)); 5014 } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { 5015 nested_release_evmcs(vcpu); 5016 } 5017 5018 return nested_vmx_succeed(vcpu); 5019 } 5020 5021 /* Emulate the VMLAUNCH instruction */ 5022 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5023 { 5024 return nested_vmx_run(vcpu, true); 5025 } 5026 5027 /* Emulate the VMRESUME instruction */ 5028 static int handle_vmresume(struct kvm_vcpu *vcpu) 5029 { 5030 5031 return nested_vmx_run(vcpu, false); 5032 } 5033 5034 static int handle_vmread(struct kvm_vcpu *vcpu) 5035 { 5036 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5037 : get_vmcs12(vcpu); 5038 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5039 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5040 struct vcpu_vmx *vmx = to_vmx(vcpu); 5041 struct x86_exception e; 5042 unsigned long field; 5043 u64 value; 5044 gva_t gva = 0; 5045 short offset; 5046 int len, r; 5047 5048 if (!nested_vmx_check_permission(vcpu)) 5049 return 1; 5050 5051 /* 5052 * In VMX non-root operation, when the VMCS-link pointer is -1ull, 5053 * any VMREAD sets the ALU flags for VMfailInvalid. 5054 */ 5055 if (vmx->nested.current_vmptr == -1ull || 5056 (is_guest_mode(vcpu) && 5057 get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) 5058 return nested_vmx_failInvalid(vcpu); 5059 5060 /* Decode instruction info and find the field to read */ 5061 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5062 5063 offset = vmcs_field_to_offset(field); 5064 if (offset < 0) 5065 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5066 5067 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5068 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5069 5070 /* Read the field, zero-extended to a u64 value */ 5071 value = vmcs12_read_any(vmcs12, field, offset); 5072 5073 /* 5074 * Now copy part of this value to register or memory, as requested. 5075 * Note that the number of bits actually copied is 32 or 64 depending 5076 * on the guest's mode (32 or 64 bit), not on the given field's length. 5077 */ 5078 if (instr_info & BIT(10)) { 5079 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5080 } else { 5081 len = is_64_bit_mode(vcpu) ? 8 : 4; 5082 if (get_vmx_mem_address(vcpu, exit_qualification, 5083 instr_info, true, len, &gva)) 5084 return 1; 5085 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5086 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5087 if (r != X86EMUL_CONTINUE) 5088 return kvm_handle_memory_failure(vcpu, r, &e); 5089 } 5090 5091 return nested_vmx_succeed(vcpu); 5092 } 5093 5094 static bool is_shadow_field_rw(unsigned long field) 5095 { 5096 switch (field) { 5097 #define SHADOW_FIELD_RW(x, y) case x: 5098 #include "vmcs_shadow_fields.h" 5099 return true; 5100 default: 5101 break; 5102 } 5103 return false; 5104 } 5105 5106 static bool is_shadow_field_ro(unsigned long field) 5107 { 5108 switch (field) { 5109 #define SHADOW_FIELD_RO(x, y) case x: 5110 #include "vmcs_shadow_fields.h" 5111 return true; 5112 default: 5113 break; 5114 } 5115 return false; 5116 } 5117 5118 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5119 { 5120 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? 
get_shadow_vmcs12(vcpu) 5121 : get_vmcs12(vcpu); 5122 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5123 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5124 struct vcpu_vmx *vmx = to_vmx(vcpu); 5125 struct x86_exception e; 5126 unsigned long field; 5127 short offset; 5128 gva_t gva; 5129 int len, r; 5130 5131 /* 5132 * The value to write might be 32 or 64 bits, depending on L1's long 5133 * mode, and eventually we need to write that into a field of several 5134 * possible lengths. The code below first zero-extends the value to 64 5135 * bit (value), and then copies only the appropriate number of 5136 * bits into the vmcs12 field. 5137 */ 5138 u64 value = 0; 5139 5140 if (!nested_vmx_check_permission(vcpu)) 5141 return 1; 5142 5143 /* 5144 * In VMX non-root operation, when the VMCS-link pointer is -1ull, 5145 * any VMWRITE sets the ALU flags for VMfailInvalid. 5146 */ 5147 if (vmx->nested.current_vmptr == -1ull || 5148 (is_guest_mode(vcpu) && 5149 get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) 5150 return nested_vmx_failInvalid(vcpu); 5151 5152 if (instr_info & BIT(10)) 5153 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5154 else { 5155 len = is_64_bit_mode(vcpu) ? 8 : 4; 5156 if (get_vmx_mem_address(vcpu, exit_qualification, 5157 instr_info, false, len, &gva)) 5158 return 1; 5159 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5160 if (r != X86EMUL_CONTINUE) 5161 return kvm_handle_memory_failure(vcpu, r, &e); 5162 } 5163 5164 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5165 5166 offset = vmcs_field_to_offset(field); 5167 if (offset < 0) 5168 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5169 5170 /* 5171 * If the vCPU supports "VMWRITE to any supported field in the 5172 * VMCS," then the "read-only" fields are actually read/write. 5173 */ 5174 if (vmcs_field_readonly(field) && 5175 !nested_cpu_has_vmwrite_any_field(vcpu)) 5176 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5177 5178 /* 5179 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5180 * vmcs12, else we may crush a field or consume a stale value. 5181 */ 5182 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5183 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5184 5185 /* 5186 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5187 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5188 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5189 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5190 * from L1 will return a different value than VMREAD from L2 (L1 sees 5191 * the stripped down value, L2 sees the full value as stored by KVM). 5192 */ 5193 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5194 value &= 0x1f0ff; 5195 5196 vmcs12_write_any(vmcs12, field, offset, value); 5197 5198 /* 5199 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5200 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5201 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5202 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5203 */ 5204 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5205 /* 5206 * L1 can read these fields without exiting, ensure the 5207 * shadow VMCS is up-to-date. 
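 * The sequence below does that by temporarily making vmcs01's shadow
 * VMCS the current VMCS, writing the field with __vmcs_writel(),
 * VMCLEARing it so the cached state is flushed back to its memory
 * region, and then restoring the previously loaded VMCS.  Preemption is
 * disabled around the sequence, presumably because the current-VMCS
 * pointer is a per-physical-CPU resource.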
5208 */ 5209 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5210 preempt_disable(); 5211 vmcs_load(vmx->vmcs01.shadow_vmcs); 5212 5213 __vmcs_writel(field, value); 5214 5215 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5216 vmcs_load(vmx->loaded_vmcs->vmcs); 5217 preempt_enable(); 5218 } 5219 vmx->nested.dirty_vmcs12 = true; 5220 } 5221 5222 return nested_vmx_succeed(vcpu); 5223 } 5224 5225 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5226 { 5227 vmx->nested.current_vmptr = vmptr; 5228 if (enable_shadow_vmcs) { 5229 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5230 vmcs_write64(VMCS_LINK_POINTER, 5231 __pa(vmx->vmcs01.shadow_vmcs)); 5232 vmx->nested.need_vmcs12_to_shadow_sync = true; 5233 } 5234 vmx->nested.dirty_vmcs12 = true; 5235 } 5236 5237 /* Emulate the VMPTRLD instruction */ 5238 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5239 { 5240 struct vcpu_vmx *vmx = to_vmx(vcpu); 5241 gpa_t vmptr; 5242 int r; 5243 5244 if (!nested_vmx_check_permission(vcpu)) 5245 return 1; 5246 5247 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5248 return r; 5249 5250 if (!page_address_valid(vcpu, vmptr)) 5251 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5252 5253 if (vmptr == vmx->nested.vmxon_ptr) 5254 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5255 5256 /* Forbid normal VMPTRLD if Enlightened version was used */ 5257 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 5258 return 1; 5259 5260 if (vmx->nested.current_vmptr != vmptr) { 5261 struct kvm_host_map map; 5262 struct vmcs12 *new_vmcs12; 5263 5264 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { 5265 /* 5266 * Reads from an unbacked page return all 1s, 5267 * which means that the 32 bits located at the 5268 * given physical address won't match the required 5269 * VMCS12_REVISION identifier. 5270 */ 5271 return nested_vmx_fail(vcpu, 5272 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5273 } 5274 5275 new_vmcs12 = map.hva; 5276 5277 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || 5278 (new_vmcs12->hdr.shadow_vmcs && 5279 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5280 kvm_vcpu_unmap(vcpu, &map, false); 5281 return nested_vmx_fail(vcpu, 5282 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5283 } 5284 5285 nested_release_vmcs12(vcpu); 5286 5287 /* 5288 * Load VMCS12 from guest memory since it is not already 5289 * cached. 
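 * Note that the copy below transfers the full VMCS12_SIZE allocation
 * rather than just sizeof(struct vmcs12); the save/restore path in
 * vmx_get_nested_state() deliberately does the same when copying the
 * cached vmcs12 out to userspace.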
5290 */ 5291 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); 5292 kvm_vcpu_unmap(vcpu, &map, false); 5293 5294 set_current_vmptr(vmx, vmptr); 5295 } 5296 5297 return nested_vmx_succeed(vcpu); 5298 } 5299 5300 /* Emulate the VMPTRST instruction */ 5301 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5302 { 5303 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5304 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5305 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5306 struct x86_exception e; 5307 gva_t gva; 5308 int r; 5309 5310 if (!nested_vmx_check_permission(vcpu)) 5311 return 1; 5312 5313 if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) 5314 return 1; 5315 5316 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5317 true, sizeof(gpa_t), &gva)) 5318 return 1; 5319 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5320 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5321 sizeof(gpa_t), &e); 5322 if (r != X86EMUL_CONTINUE) 5323 return kvm_handle_memory_failure(vcpu, r, &e); 5324 5325 return nested_vmx_succeed(vcpu); 5326 } 5327 5328 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 5329 5330 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 5331 { 5332 return VALID_PAGE(root_hpa) && 5333 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 5334 } 5335 5336 /* Emulate the INVEPT instruction */ 5337 static int handle_invept(struct kvm_vcpu *vcpu) 5338 { 5339 struct vcpu_vmx *vmx = to_vmx(vcpu); 5340 u32 vmx_instruction_info, types; 5341 unsigned long type, roots_to_free; 5342 struct kvm_mmu *mmu; 5343 gva_t gva; 5344 struct x86_exception e; 5345 struct { 5346 u64 eptp, gpa; 5347 } operand; 5348 int i, r; 5349 5350 if (!(vmx->nested.msrs.secondary_ctls_high & 5351 SECONDARY_EXEC_ENABLE_EPT) || 5352 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5353 kvm_queue_exception(vcpu, UD_VECTOR); 5354 return 1; 5355 } 5356 5357 if (!nested_vmx_check_permission(vcpu)) 5358 return 1; 5359 5360 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5361 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); 5362 5363 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5364 5365 if (type >= 32 || !(types & (1 << type))) 5366 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5367 5368 /* According to the Intel VMX instruction reference, the memory 5369 * operand is read even if it isn't needed (e.g., for type==global) 5370 */ 5371 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5372 vmx_instruction_info, false, sizeof(operand), &gva)) 5373 return 1; 5374 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5375 if (r != X86EMUL_CONTINUE) 5376 return kvm_handle_memory_failure(vcpu, r, &e); 5377 5378 /* 5379 * Nested EPT roots are always held through guest_mmu, 5380 * not root_mmu. 
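 * For the single-context case further down, "matches" means the
 * 4KiB-aligned physical-address bits of the EPTP (bits 51:12, i.e.
 * EPTP_PA_MASK) are equal; nested_ept_root_matches() ignores the low
 * attribute bits (memory type, page-walk length, accessed/dirty
 * enable), so e.g. two EPTPs differing only in the A/D-enable bit
 * still match the same cached root.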
5381 */ 5382 mmu = &vcpu->arch.guest_mmu; 5383 5384 switch (type) { 5385 case VMX_EPT_EXTENT_CONTEXT: 5386 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5387 return nested_vmx_fail(vcpu, 5388 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5389 5390 roots_to_free = 0; 5391 if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd, 5392 operand.eptp)) 5393 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5394 5395 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5396 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5397 mmu->prev_roots[i].pgd, 5398 operand.eptp)) 5399 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5400 } 5401 break; 5402 case VMX_EPT_EXTENT_GLOBAL: 5403 roots_to_free = KVM_MMU_ROOTS_ALL; 5404 break; 5405 default: 5406 BUG(); 5407 break; 5408 } 5409 5410 if (roots_to_free) 5411 kvm_mmu_free_roots(vcpu, mmu, roots_to_free); 5412 5413 return nested_vmx_succeed(vcpu); 5414 } 5415 5416 static int handle_invvpid(struct kvm_vcpu *vcpu) 5417 { 5418 struct vcpu_vmx *vmx = to_vmx(vcpu); 5419 u32 vmx_instruction_info; 5420 unsigned long type, types; 5421 gva_t gva; 5422 struct x86_exception e; 5423 struct { 5424 u64 vpid; 5425 u64 gla; 5426 } operand; 5427 u16 vpid02; 5428 int r; 5429 5430 if (!(vmx->nested.msrs.secondary_ctls_high & 5431 SECONDARY_EXEC_ENABLE_VPID) || 5432 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5433 kvm_queue_exception(vcpu, UD_VECTOR); 5434 return 1; 5435 } 5436 5437 if (!nested_vmx_check_permission(vcpu)) 5438 return 1; 5439 5440 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5441 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); 5442 5443 types = (vmx->nested.msrs.vpid_caps & 5444 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5445 5446 if (type >= 32 || !(types & (1 << type))) 5447 return nested_vmx_fail(vcpu, 5448 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5449 5450 /* according to the intel vmx instruction reference, the memory 5451 * operand is read even if it isn't needed (e.g., for type==global) 5452 */ 5453 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5454 vmx_instruction_info, false, sizeof(operand), &gva)) 5455 return 1; 5456 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5457 if (r != X86EMUL_CONTINUE) 5458 return kvm_handle_memory_failure(vcpu, r, &e); 5459 5460 if (operand.vpid >> 16) 5461 return nested_vmx_fail(vcpu, 5462 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5463 5464 vpid02 = nested_get_vpid02(vcpu); 5465 switch (type) { 5466 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5467 if (!operand.vpid || 5468 is_noncanonical_address(operand.gla, vcpu)) 5469 return nested_vmx_fail(vcpu, 5470 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5471 vpid_sync_vcpu_addr(vpid02, operand.gla); 5472 break; 5473 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5474 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5475 if (!operand.vpid) 5476 return nested_vmx_fail(vcpu, 5477 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5478 vpid_sync_context(vpid02); 5479 break; 5480 case VMX_VPID_EXTENT_ALL_CONTEXT: 5481 vpid_sync_context(vpid02); 5482 break; 5483 default: 5484 WARN_ON_ONCE(1); 5485 return kvm_skip_emulated_instruction(vcpu); 5486 } 5487 5488 /* 5489 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5490 * linear mappings for L2 (tagged with L2's VPID). Free all guest 5491 * roots as VPIDs are not tracked in the MMU role. 5492 * 5493 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5494 * an MMU when EPT is disabled. 5495 * 5496 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 
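 *
 * Summary of the flushes chosen above: INDIVIDUAL_ADDR uses
 * vpid_sync_vcpu_addr() on the guest linear address, while the
 * SINGLE_CONTEXT, SINGLE_NON_GLOBAL and ALL_CONTEXT extents all funnel
 * into vpid_sync_context() on the vpid02 used for L2.  That
 * over-invalidates for SINGLE_NON_GLOBAL (global translations are
 * flushed as well), which is architecturally permitted.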
5497 */ 5498 if (!enable_ept) 5499 kvm_mmu_free_guest_mode_roots(vcpu, &vcpu->arch.root_mmu); 5500 5501 return nested_vmx_succeed(vcpu); 5502 } 5503 5504 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5505 struct vmcs12 *vmcs12) 5506 { 5507 u32 index = kvm_rcx_read(vcpu); 5508 u64 new_eptp; 5509 5510 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 5511 return 1; 5512 if (index >= VMFUNC_EPTP_ENTRIES) 5513 return 1; 5514 5515 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5516 &new_eptp, index * 8, 8)) 5517 return 1; 5518 5519 /* 5520 * If the (L2) guest does a vmfunc to the currently 5521 * active ept pointer, we don't have to do anything else 5522 */ 5523 if (vmcs12->ept_pointer != new_eptp) { 5524 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5525 return 1; 5526 5527 vmcs12->ept_pointer = new_eptp; 5528 nested_ept_new_eptp(vcpu); 5529 5530 if (!nested_cpu_has_vpid(vmcs12)) 5531 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 5532 } 5533 5534 return 0; 5535 } 5536 5537 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5538 { 5539 struct vcpu_vmx *vmx = to_vmx(vcpu); 5540 struct vmcs12 *vmcs12; 5541 u32 function = kvm_rax_read(vcpu); 5542 5543 /* 5544 * VMFUNC is only supported for nested guests, but we always enable the 5545 * secondary control for simplicity; for non-nested mode, fake that we 5546 * didn't by injecting #UD. 5547 */ 5548 if (!is_guest_mode(vcpu)) { 5549 kvm_queue_exception(vcpu, UD_VECTOR); 5550 return 1; 5551 } 5552 5553 vmcs12 = get_vmcs12(vcpu); 5554 5555 /* 5556 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 5557 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 5558 */ 5559 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 5560 kvm_queue_exception(vcpu, UD_VECTOR); 5561 return 1; 5562 } 5563 5564 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 5565 goto fail; 5566 5567 switch (function) { 5568 case 0: 5569 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5570 goto fail; 5571 break; 5572 default: 5573 goto fail; 5574 } 5575 return kvm_skip_emulated_instruction(vcpu); 5576 5577 fail: 5578 /* 5579 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5580 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5581 * EXIT_REASON_VMFUNC as the exit reason. 5582 */ 5583 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5584 vmx_get_intr_info(vcpu), 5585 vmx_get_exit_qual(vcpu)); 5586 return 1; 5587 } 5588 5589 /* 5590 * Return true if an IO instruction with the specified port and size should cause 5591 * a VM-exit into L1. 
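 *
 * The two I/O bitmaps in vmcs12 each cover 32K ports, one bit per port:
 * io_bitmap_a covers ports 0x0000-0x7fff and io_bitmap_b covers ports
 * 0x8000-0xffff.  Worked example: a two-byte access to port 0x71 tests
 * io_bitmap_a byte 0x71 / 8 = 14, bit 1 for port 0x71, then byte 14,
 * bit 2 for port 0x72; if either bit is set, the access exits to L1.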
5592 */ 5593 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5594 int size) 5595 { 5596 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5597 gpa_t bitmap, last_bitmap; 5598 u8 b; 5599 5600 last_bitmap = (gpa_t)-1; 5601 b = -1; 5602 5603 while (size > 0) { 5604 if (port < 0x8000) 5605 bitmap = vmcs12->io_bitmap_a; 5606 else if (port < 0x10000) 5607 bitmap = vmcs12->io_bitmap_b; 5608 else 5609 return true; 5610 bitmap += (port & 0x7fff) / 8; 5611 5612 if (last_bitmap != bitmap) 5613 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5614 return true; 5615 if (b & (1 << (port & 7))) 5616 return true; 5617 5618 port++; 5619 size--; 5620 last_bitmap = bitmap; 5621 } 5622 5623 return false; 5624 } 5625 5626 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5627 struct vmcs12 *vmcs12) 5628 { 5629 unsigned long exit_qualification; 5630 unsigned short port; 5631 int size; 5632 5633 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5634 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 5635 5636 exit_qualification = vmx_get_exit_qual(vcpu); 5637 5638 port = exit_qualification >> 16; 5639 size = (exit_qualification & 7) + 1; 5640 5641 return nested_vmx_check_io_bitmaps(vcpu, port, size); 5642 } 5643 5644 /* 5645 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 5646 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 5647 * disinterest in the current event (read or write a specific MSR) by using an 5648 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 5649 */ 5650 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 5651 struct vmcs12 *vmcs12, 5652 union vmx_exit_reason exit_reason) 5653 { 5654 u32 msr_index = kvm_rcx_read(vcpu); 5655 gpa_t bitmap; 5656 5657 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 5658 return true; 5659 5660 /* 5661 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 5662 * for the four combinations of read/write and low/high MSR numbers. 5663 * First we need to figure out which of the four to use: 5664 */ 5665 bitmap = vmcs12->msr_bitmap; 5666 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 5667 bitmap += 2048; 5668 if (msr_index >= 0xc0000000) { 5669 msr_index -= 0xc0000000; 5670 bitmap += 1024; 5671 } 5672 5673 /* Then read the msr_index'th bit from this bitmap: */ 5674 if (msr_index < 1024*8) { 5675 unsigned char b; 5676 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 5677 return true; 5678 return 1 & (b >> (msr_index & 7)); 5679 } else 5680 return true; /* let L1 handle the wrong parameter */ 5681 } 5682 5683 /* 5684 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 5685 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 5686 * intercept (via guest_host_mask etc.) the current event. 
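 *
 * Worked example for MOV-to-CR0 below: the access is forwarded to L1
 * only if the new value differs from the CR0 read shadow in a bit that
 * L1 owns, i.e. if
 *
 *	(val ^ vmcs12->cr0_read_shadow) & vmcs12->cr0_guest_host_mask
 *
 * is non-zero.  With cr0_guest_host_mask == X86_CR0_TS, an L2 write
 * that toggles CR0.TS relative to the shadow is reflected to L1, while
 * writes touching only other CR0 bits are handled by L0.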
5687 */ 5688 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 5689 struct vmcs12 *vmcs12) 5690 { 5691 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5692 int cr = exit_qualification & 15; 5693 int reg; 5694 unsigned long val; 5695 5696 switch ((exit_qualification >> 4) & 3) { 5697 case 0: /* mov to cr */ 5698 reg = (exit_qualification >> 8) & 15; 5699 val = kvm_register_read(vcpu, reg); 5700 switch (cr) { 5701 case 0: 5702 if (vmcs12->cr0_guest_host_mask & 5703 (val ^ vmcs12->cr0_read_shadow)) 5704 return true; 5705 break; 5706 case 3: 5707 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 5708 return true; 5709 break; 5710 case 4: 5711 if (vmcs12->cr4_guest_host_mask & 5712 (vmcs12->cr4_read_shadow ^ val)) 5713 return true; 5714 break; 5715 case 8: 5716 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 5717 return true; 5718 break; 5719 } 5720 break; 5721 case 2: /* clts */ 5722 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 5723 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 5724 return true; 5725 break; 5726 case 1: /* mov from cr */ 5727 switch (cr) { 5728 case 3: 5729 if (vmcs12->cpu_based_vm_exec_control & 5730 CPU_BASED_CR3_STORE_EXITING) 5731 return true; 5732 break; 5733 case 8: 5734 if (vmcs12->cpu_based_vm_exec_control & 5735 CPU_BASED_CR8_STORE_EXITING) 5736 return true; 5737 break; 5738 } 5739 break; 5740 case 3: /* lmsw */ 5741 /* 5742 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 5743 * cr0. Other attempted changes are ignored, with no exit. 5744 */ 5745 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5746 if (vmcs12->cr0_guest_host_mask & 0xe & 5747 (val ^ vmcs12->cr0_read_shadow)) 5748 return true; 5749 if ((vmcs12->cr0_guest_host_mask & 0x1) && 5750 !(vmcs12->cr0_read_shadow & 0x1) && 5751 (val & 0x1)) 5752 return true; 5753 break; 5754 } 5755 return false; 5756 } 5757 5758 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 5759 struct vmcs12 *vmcs12) 5760 { 5761 u32 encls_leaf; 5762 5763 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 5764 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 5765 return false; 5766 5767 encls_leaf = kvm_rax_read(vcpu); 5768 if (encls_leaf > 62) 5769 encls_leaf = 63; 5770 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 5771 } 5772 5773 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 5774 struct vmcs12 *vmcs12, gpa_t bitmap) 5775 { 5776 u32 vmx_instruction_info; 5777 unsigned long field; 5778 u8 b; 5779 5780 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 5781 return true; 5782 5783 /* Decode instruction info and find the field to access */ 5784 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5785 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 5786 5787 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 5788 if (field >> 15) 5789 return true; 5790 5791 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 5792 return true; 5793 5794 return 1 & (b >> (field & 7)); 5795 } 5796 5797 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 5798 { 5799 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 5800 5801 if (nested_cpu_has_mtf(vmcs12)) 5802 return true; 5803 5804 /* 5805 * An MTF VM-exit may be injected into the guest by setting the 5806 * interruption-type to 7 (other event) and the vector field to 0. Such 5807 * is the case regardless of the 'monitor trap flag' VM-execution 5808 * control. 
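 * That encoding is exactly what the comparison below matches:
 * INTR_INFO_VALID_MASK (bit 31) plus INTR_TYPE_OTHER_EVENT (type 7 in
 * bits 10:8) with a zero vector field and no other bits set, i.e.
 * nominally the value 0x80000700 in vm_entry_intr_info_field.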
5809 */ 5810 return entry_intr_info == (INTR_INFO_VALID_MASK 5811 | INTR_TYPE_OTHER_EVENT); 5812 } 5813 5814 /* 5815 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 5816 * L1 wants the exit. Only call this when in is_guest_mode (L2). 5817 */ 5818 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 5819 union vmx_exit_reason exit_reason) 5820 { 5821 u32 intr_info; 5822 5823 switch ((u16)exit_reason.basic) { 5824 case EXIT_REASON_EXCEPTION_NMI: 5825 intr_info = vmx_get_intr_info(vcpu); 5826 if (is_nmi(intr_info)) 5827 return true; 5828 else if (is_page_fault(intr_info)) 5829 return vcpu->arch.apf.host_apf_flags || !enable_ept; 5830 else if (is_debug(intr_info) && 5831 vcpu->guest_debug & 5832 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5833 return true; 5834 else if (is_breakpoint(intr_info) && 5835 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5836 return true; 5837 else if (is_alignment_check(intr_info) && 5838 !vmx_guest_inject_ac(vcpu)) 5839 return true; 5840 return false; 5841 case EXIT_REASON_EXTERNAL_INTERRUPT: 5842 return true; 5843 case EXIT_REASON_MCE_DURING_VMENTRY: 5844 return true; 5845 case EXIT_REASON_EPT_VIOLATION: 5846 /* 5847 * L0 always deals with the EPT violation. If nested EPT is 5848 * used, and the nested mmu code discovers that the address is 5849 * missing in the guest EPT table (EPT12), the EPT violation 5850 * will be injected with nested_ept_inject_page_fault() 5851 */ 5852 return true; 5853 case EXIT_REASON_EPT_MISCONFIG: 5854 /* 5855 * L2 never uses directly L1's EPT, but rather L0's own EPT 5856 * table (shadow on EPT) or a merged EPT table that L0 built 5857 * (EPT on EPT). So any problems with the structure of the 5858 * table is L0's fault. 5859 */ 5860 return true; 5861 case EXIT_REASON_PREEMPTION_TIMER: 5862 return true; 5863 case EXIT_REASON_PML_FULL: 5864 /* 5865 * PML is emulated for an L1 VMM and should never be enabled in 5866 * vmcs02, always "handle" PML_FULL by exiting to userspace. 5867 */ 5868 return true; 5869 case EXIT_REASON_VMFUNC: 5870 /* VM functions are emulated through L2->L0 vmexits. */ 5871 return true; 5872 default: 5873 break; 5874 } 5875 return false; 5876 } 5877 5878 /* 5879 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 5880 * is_guest_mode (L2). 
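 * Note the division of labour with nested_vmx_l0_wants_exit() above:
 * nested_vmx_reflect_vmexit() consults L0 first, and only when L0 does
 * not claim the exit does the answer here decide whether the exit is
 * reflected into L1 or handled by L0 on L1's behalf.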
5881 */ 5882 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 5883 union vmx_exit_reason exit_reason) 5884 { 5885 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5886 u32 intr_info; 5887 5888 switch ((u16)exit_reason.basic) { 5889 case EXIT_REASON_EXCEPTION_NMI: 5890 intr_info = vmx_get_intr_info(vcpu); 5891 if (is_nmi(intr_info)) 5892 return true; 5893 else if (is_page_fault(intr_info)) 5894 return true; 5895 return vmcs12->exception_bitmap & 5896 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 5897 case EXIT_REASON_EXTERNAL_INTERRUPT: 5898 return nested_exit_on_intr(vcpu); 5899 case EXIT_REASON_TRIPLE_FAULT: 5900 return true; 5901 case EXIT_REASON_INTERRUPT_WINDOW: 5902 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 5903 case EXIT_REASON_NMI_WINDOW: 5904 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 5905 case EXIT_REASON_TASK_SWITCH: 5906 return true; 5907 case EXIT_REASON_CPUID: 5908 return true; 5909 case EXIT_REASON_HLT: 5910 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 5911 case EXIT_REASON_INVD: 5912 return true; 5913 case EXIT_REASON_INVLPG: 5914 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5915 case EXIT_REASON_RDPMC: 5916 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 5917 case EXIT_REASON_RDRAND: 5918 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 5919 case EXIT_REASON_RDSEED: 5920 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 5921 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 5922 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 5923 case EXIT_REASON_VMREAD: 5924 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5925 vmcs12->vmread_bitmap); 5926 case EXIT_REASON_VMWRITE: 5927 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5928 vmcs12->vmwrite_bitmap); 5929 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 5930 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 5931 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 5932 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 5933 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 5934 /* 5935 * VMX instructions trap unconditionally. This allows L1 to 5936 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
5937 */ 5938 return true; 5939 case EXIT_REASON_CR_ACCESS: 5940 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 5941 case EXIT_REASON_DR_ACCESS: 5942 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 5943 case EXIT_REASON_IO_INSTRUCTION: 5944 return nested_vmx_exit_handled_io(vcpu, vmcs12); 5945 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 5946 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 5947 case EXIT_REASON_MSR_READ: 5948 case EXIT_REASON_MSR_WRITE: 5949 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 5950 case EXIT_REASON_INVALID_STATE: 5951 return true; 5952 case EXIT_REASON_MWAIT_INSTRUCTION: 5953 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 5954 case EXIT_REASON_MONITOR_TRAP_FLAG: 5955 return nested_vmx_exit_handled_mtf(vmcs12); 5956 case EXIT_REASON_MONITOR_INSTRUCTION: 5957 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 5958 case EXIT_REASON_PAUSE_INSTRUCTION: 5959 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 5960 nested_cpu_has2(vmcs12, 5961 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 5962 case EXIT_REASON_MCE_DURING_VMENTRY: 5963 return true; 5964 case EXIT_REASON_TPR_BELOW_THRESHOLD: 5965 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 5966 case EXIT_REASON_APIC_ACCESS: 5967 case EXIT_REASON_APIC_WRITE: 5968 case EXIT_REASON_EOI_INDUCED: 5969 /* 5970 * The controls for "virtualize APIC accesses," "APIC- 5971 * register virtualization," and "virtual-interrupt 5972 * delivery" only come from vmcs12. 5973 */ 5974 return true; 5975 case EXIT_REASON_INVPCID: 5976 return 5977 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 5978 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5979 case EXIT_REASON_WBINVD: 5980 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 5981 case EXIT_REASON_XSETBV: 5982 return true; 5983 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 5984 /* 5985 * This should never happen, since it is not possible to 5986 * set XSS to a non-zero value---neither in L1 nor in L2. 5987 * If if it were, XSS would have to be checked against 5988 * the XSS exit bitmap in vmcs12. 5989 */ 5990 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 5991 case EXIT_REASON_UMWAIT: 5992 case EXIT_REASON_TPAUSE: 5993 return nested_cpu_has2(vmcs12, 5994 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 5995 case EXIT_REASON_ENCLS: 5996 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 5997 default: 5998 return true; 5999 } 6000 } 6001 6002 /* 6003 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6004 * reflected into L1. 6005 */ 6006 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6007 { 6008 struct vcpu_vmx *vmx = to_vmx(vcpu); 6009 union vmx_exit_reason exit_reason = vmx->exit_reason; 6010 unsigned long exit_qual; 6011 u32 exit_intr_info; 6012 6013 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6014 6015 /* 6016 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6017 * has already loaded L2's state. 6018 */ 6019 if (unlikely(vmx->fail)) { 6020 trace_kvm_nested_vmenter_failed( 6021 "hardware VM-instruction error: ", 6022 vmcs_read32(VM_INSTRUCTION_ERROR)); 6023 exit_intr_info = 0; 6024 exit_qual = 0; 6025 goto reflect_vmexit; 6026 } 6027 6028 trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX); 6029 6030 /* If L0 (KVM) wants the exit, it trumps L1's desires. */ 6031 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6032 return false; 6033 6034 /* If L1 doesn't want the exit, handle it in L0. 
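 * In that case we return false and the exit is processed by KVM's
 * ordinary L1-level exit handlers.  The caller in vmx.c is expected to
 * use the return value roughly as follows (illustrative sketch, not a
 * verbatim quote of that code):
 *
 *	if (is_guest_mode(vcpu) && nested_vmx_reflect_vmexit(vcpu))
 *		return 1;
 *
 * with the non-reflected case falling through to the normal handlers.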
*/ 6035 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6036 return false; 6037 6038 /* 6039 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6040 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6041 * need to be synthesized by querying the in-kernel LAPIC, but external 6042 * interrupts are never reflected to L1 so it's a non-issue. 6043 */ 6044 exit_intr_info = vmx_get_intr_info(vcpu); 6045 if (is_exception_with_error_code(exit_intr_info)) { 6046 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6047 6048 vmcs12->vm_exit_intr_error_code = 6049 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6050 } 6051 exit_qual = vmx_get_exit_qual(vcpu); 6052 6053 reflect_vmexit: 6054 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6055 return true; 6056 } 6057 6058 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6059 struct kvm_nested_state __user *user_kvm_nested_state, 6060 u32 user_data_size) 6061 { 6062 struct vcpu_vmx *vmx; 6063 struct vmcs12 *vmcs12; 6064 struct kvm_nested_state kvm_state = { 6065 .flags = 0, 6066 .format = KVM_STATE_NESTED_FORMAT_VMX, 6067 .size = sizeof(kvm_state), 6068 .hdr.vmx.flags = 0, 6069 .hdr.vmx.vmxon_pa = -1ull, 6070 .hdr.vmx.vmcs12_pa = -1ull, 6071 .hdr.vmx.preemption_timer_deadline = 0, 6072 }; 6073 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6074 &user_kvm_nested_state->data.vmx[0]; 6075 6076 if (!vcpu) 6077 return kvm_state.size + sizeof(*user_vmx_nested_state); 6078 6079 vmx = to_vmx(vcpu); 6080 vmcs12 = get_vmcs12(vcpu); 6081 6082 if (nested_vmx_allowed(vcpu) && 6083 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6084 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6085 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6086 6087 if (vmx_has_valid_vmcs12(vcpu)) { 6088 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6089 6090 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6091 if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) 6092 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6093 6094 if (is_guest_mode(vcpu) && 6095 nested_cpu_has_shadow_vmcs(vmcs12) && 6096 vmcs12->vmcs_link_pointer != -1ull) 6097 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6098 } 6099 6100 if (vmx->nested.smm.vmxon) 6101 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6102 6103 if (vmx->nested.smm.guest_mode) 6104 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6105 6106 if (is_guest_mode(vcpu)) { 6107 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6108 6109 if (vmx->nested.nested_run_pending) 6110 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6111 6112 if (vmx->nested.mtf_pending) 6113 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6114 6115 if (nested_cpu_has_preemption_timer(vmcs12) && 6116 vmx->nested.has_preemption_timer_deadline) { 6117 kvm_state.hdr.vmx.flags |= 6118 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6119 kvm_state.hdr.vmx.preemption_timer_deadline = 6120 vmx->nested.preemption_timer_deadline; 6121 } 6122 } 6123 } 6124 6125 if (user_data_size < kvm_state.size) 6126 goto out; 6127 6128 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6129 return -EFAULT; 6130 6131 if (!vmx_has_valid_vmcs12(vcpu)) 6132 goto out; 6133 6134 /* 6135 * When running L2, the authoritative vmcs12 state is in the 6136 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6137 * in the shadow or enlightened vmcs linked to vmcs01, unless 6138 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6139 * vmcs12 state is in the vmcs12 already. 6140 */ 6141 if (is_guest_mode(vcpu)) { 6142 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6143 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6144 } else { 6145 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6146 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6147 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 6148 /* 6149 * L1 hypervisor is not obliged to keep eVMCS 6150 * clean fields data always up-to-date while 6151 * not in guest mode, 'hv_clean_fields' is only 6152 * supposed to be actual upon vmentry so we need 6153 * to ignore it here and do full copy. 6154 */ 6155 copy_enlightened_to_vmcs12(vmx, 0); 6156 else if (enable_shadow_vmcs) 6157 copy_shadow_to_vmcs12(vmx); 6158 } 6159 } 6160 6161 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6162 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6163 6164 /* 6165 * Copy over the full allocated size of vmcs12 rather than just the size 6166 * of the struct. 6167 */ 6168 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6169 return -EFAULT; 6170 6171 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6172 vmcs12->vmcs_link_pointer != -1ull) { 6173 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6174 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6175 return -EFAULT; 6176 } 6177 out: 6178 return kvm_state.size; 6179 } 6180 6181 /* 6182 * Forcibly leave nested mode in order to be able to reset the VCPU later on. 6183 */ 6184 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6185 { 6186 if (is_guest_mode(vcpu)) { 6187 to_vmx(vcpu)->nested.nested_run_pending = 0; 6188 nested_vmx_vmexit(vcpu, -1, 0, 0); 6189 } 6190 free_nested(vcpu); 6191 } 6192 6193 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6194 struct kvm_nested_state __user *user_kvm_nested_state, 6195 struct kvm_nested_state *kvm_state) 6196 { 6197 struct vcpu_vmx *vmx = to_vmx(vcpu); 6198 struct vmcs12 *vmcs12; 6199 enum vm_entry_failure_code ignored; 6200 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6201 &user_kvm_nested_state->data.vmx[0]; 6202 int ret; 6203 6204 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6205 return -EINVAL; 6206 6207 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) { 6208 if (kvm_state->hdr.vmx.smm.flags) 6209 return -EINVAL; 6210 6211 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) 6212 return -EINVAL; 6213 6214 /* 6215 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6216 * enable eVMCS capability on vCPU. However, since then 6217 * code was changed such that flag signals vmcs12 should 6218 * be copied into eVMCS in guest memory. 6219 * 6220 * To preserve backwards compatability, allow user 6221 * to set this flag even when there is no VMXON region. 
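 * Concretely, when vmxon_pa == -1ull the only flag tolerated below is
 * KVM_STATE_NESTED_EVMCS; any other flag makes the restore fail with
 * -EINVAL.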
6222 */ 6223 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6224 return -EINVAL; 6225 } else { 6226 if (!nested_vmx_allowed(vcpu)) 6227 return -EINVAL; 6228 6229 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6230 return -EINVAL; 6231 } 6232 6233 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6234 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6235 return -EINVAL; 6236 6237 if (kvm_state->hdr.vmx.smm.flags & 6238 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6239 return -EINVAL; 6240 6241 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6242 return -EINVAL; 6243 6244 /* 6245 * SMM temporarily disables VMX, so we cannot be in guest mode, 6246 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6247 * must be zero. 6248 */ 6249 if (is_smm(vcpu) ? 6250 (kvm_state->flags & 6251 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6252 : kvm_state->hdr.vmx.smm.flags) 6253 return -EINVAL; 6254 6255 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6256 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6257 return -EINVAL; 6258 6259 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6260 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) 6261 return -EINVAL; 6262 6263 vmx_leave_nested(vcpu); 6264 6265 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) 6266 return 0; 6267 6268 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6269 ret = enter_vmx_operation(vcpu); 6270 if (ret) 6271 return ret; 6272 6273 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6274 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6275 /* See vmx_has_valid_vmcs12. */ 6276 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6277 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6278 (kvm_state->hdr.vmx.vmcs12_pa != -1ull)) 6279 return -EINVAL; 6280 else 6281 return 0; 6282 } 6283 6284 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { 6285 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6286 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6287 return -EINVAL; 6288 6289 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6290 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6291 /* 6292 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6293 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6294 * restored yet. EVMCS will be mapped from 6295 * nested_get_vmcs12_pages(). 
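 * The deferral below works by marking the pointer as EVMPTR_MAP_PENDING
 * and raising KVM_REQ_GET_NESTED_STATE_PAGES, so the actual mapping is
 * retried on the next vcpu entry, by which point userspace is expected
 * to have restored the VP assist page MSR as well.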
6296 */ 6297 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; 6298 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 6299 } else { 6300 return -EINVAL; 6301 } 6302 6303 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 6304 vmx->nested.smm.vmxon = true; 6305 vmx->nested.vmxon = false; 6306 6307 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 6308 vmx->nested.smm.guest_mode = true; 6309 } 6310 6311 vmcs12 = get_vmcs12(vcpu); 6312 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 6313 return -EFAULT; 6314 6315 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 6316 return -EINVAL; 6317 6318 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6319 return 0; 6320 6321 vmx->nested.nested_run_pending = 6322 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 6323 6324 vmx->nested.mtf_pending = 6325 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 6326 6327 ret = -EINVAL; 6328 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6329 vmcs12->vmcs_link_pointer != -1ull) { 6330 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6331 6332 if (kvm_state->size < 6333 sizeof(*kvm_state) + 6334 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 6335 goto error_guest_mode; 6336 6337 if (copy_from_user(shadow_vmcs12, 6338 user_vmx_nested_state->shadow_vmcs12, 6339 sizeof(*shadow_vmcs12))) { 6340 ret = -EFAULT; 6341 goto error_guest_mode; 6342 } 6343 6344 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6345 !shadow_vmcs12->hdr.shadow_vmcs) 6346 goto error_guest_mode; 6347 } 6348 6349 vmx->nested.has_preemption_timer_deadline = false; 6350 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 6351 vmx->nested.has_preemption_timer_deadline = true; 6352 vmx->nested.preemption_timer_deadline = 6353 kvm_state->hdr.vmx.preemption_timer_deadline; 6354 } 6355 6356 if (nested_vmx_check_controls(vcpu, vmcs12) || 6357 nested_vmx_check_host_state(vcpu, vmcs12) || 6358 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6359 goto error_guest_mode; 6360 6361 vmx->nested.dirty_vmcs12 = true; 6362 ret = nested_vmx_enter_non_root_mode(vcpu, false); 6363 if (ret) 6364 goto error_guest_mode; 6365 6366 return 0; 6367 6368 error_guest_mode: 6369 vmx->nested.nested_run_pending = 0; 6370 return ret; 6371 } 6372 6373 void nested_vmx_set_vmcs_shadowing_bitmap(void) 6374 { 6375 if (enable_shadow_vmcs) { 6376 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 6377 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 6378 } 6379 } 6380 6381 /* 6382 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo 6383 * that madness to get the encoding for comparison. 6384 */ 6385 #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) 6386 6387 static u64 nested_vmx_calc_vmcs_enum_msr(void) 6388 { 6389 /* 6390 * Note these are the so called "index" of the VMCS field encoding, not 6391 * the index into vmcs12. 6392 */ 6393 unsigned int max_idx, idx; 6394 int i; 6395 6396 /* 6397 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in 6398 * vmcs12, regardless of whether or not the associated feature is 6399 * exposed to L1. Simply find the field with the highest index. 6400 */ 6401 max_idx = 0; 6402 for (i = 0; i < nr_vmcs12_fields; i++) { 6403 /* The vmcs12 table is very, very sparsely populated. 
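 * Worked example of the conversion done by VMCS12_IDX_TO_ENC() (defined
 * above, used just below): it is a 16-bit rotate right by 6
 * (equivalently, rotate left by 10), undoing the rotate-left-by-6 used
 * to build vmcs_field_to_offset_table.  For instance, table index
 * 0x0040 yields (0x0040 >> 6) | (0x0040 << 10) == 0x0001 once truncated
 * to 16 bits, and vmcs_field_index() then pulls the encoding's index
 * bits out of that value.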
*/ 6404 if (!vmcs_field_to_offset_table[i]) 6405 continue; 6406 6407 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); 6408 if (idx > max_idx) 6409 max_idx = idx; 6410 } 6411 6412 return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; 6413 } 6414 6415 /* 6416 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 6417 * returned for the various VMX controls MSRs when nested VMX is enabled. 6418 * The same values should also be used to verify that vmcs12 control fields are 6419 * valid during nested entry from L1 to L2. 6420 * Each of these control msrs has a low and high 32-bit half: A low bit is on 6421 * if the corresponding bit in the (32-bit) control field *must* be on, and a 6422 * bit in the high half is on if the corresponding bit in the control field 6423 * may be on. See also vmx_control_verify(). 6424 */ 6425 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) 6426 { 6427 /* 6428 * Note that as a general rule, the high half of the MSRs (bits in 6429 * the control fields which may be 1) should be initialized by the 6430 * intersection of the underlying hardware's MSR (i.e., features which 6431 * can be supported) and the list of features we want to expose - 6432 * because they are known to be properly supported in our code. 6433 * Also, usually, the low half of the MSRs (bits which must be 1) can 6434 * be set to 0, meaning that L1 may turn off any of these bits. The 6435 * reason is that if one of these bits is necessary, it will appear 6436 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 6437 * fields of vmcs01 and vmcs02, will turn these bits off - and 6438 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 6439 * These rules have exceptions below. 6440 */ 6441 6442 /* pin-based controls */ 6443 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 6444 msrs->pinbased_ctls_low, 6445 msrs->pinbased_ctls_high); 6446 msrs->pinbased_ctls_low |= 6447 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6448 msrs->pinbased_ctls_high &= 6449 PIN_BASED_EXT_INTR_MASK | 6450 PIN_BASED_NMI_EXITING | 6451 PIN_BASED_VIRTUAL_NMIS | 6452 (enable_apicv ? PIN_BASED_POSTED_INTR : 0); 6453 msrs->pinbased_ctls_high |= 6454 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6455 PIN_BASED_VMX_PREEMPTION_TIMER; 6456 6457 /* exit controls */ 6458 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 6459 msrs->exit_ctls_low, 6460 msrs->exit_ctls_high); 6461 msrs->exit_ctls_low = 6462 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 6463 6464 msrs->exit_ctls_high &= 6465 #ifdef CONFIG_X86_64 6466 VM_EXIT_HOST_ADDR_SPACE_SIZE | 6467 #endif 6468 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 6469 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 6470 msrs->exit_ctls_high |= 6471 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 6472 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 6473 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 6474 6475 /* We support free control of debug control saving. */ 6476 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 6477 6478 /* entry controls */ 6479 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 6480 msrs->entry_ctls_low, 6481 msrs->entry_ctls_high); 6482 msrs->entry_ctls_low = 6483 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 6484 msrs->entry_ctls_high &= 6485 #ifdef CONFIG_X86_64 6486 VM_ENTRY_IA32E_MODE | 6487 #endif 6488 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | 6489 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 6490 msrs->entry_ctls_high |= 6491 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 6492 6493 /* We support free control of debug control loading. 
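 * (Clearing a bit from a low half, as done for
 * VM_ENTRY_LOAD_DEBUG_CONTROLS just below, means L1 is free to leave
 * that control clear.)  For reference, a vmcs12 control value passes
 * the low/high check described at the top of this function roughly when
 *
 *	((ctl & high) | low) == ctl
 *
 * i.e. every must-be-1 (low) bit is set and nothing outside the
 * may-be-1 (high) mask is set; see vmx_control_verify().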
*/ 6494 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 6495 6496 /* cpu-based controls */ 6497 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 6498 msrs->procbased_ctls_low, 6499 msrs->procbased_ctls_high); 6500 msrs->procbased_ctls_low = 6501 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6502 msrs->procbased_ctls_high &= 6503 CPU_BASED_INTR_WINDOW_EXITING | 6504 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 6505 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 6506 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 6507 CPU_BASED_CR3_STORE_EXITING | 6508 #ifdef CONFIG_X86_64 6509 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 6510 #endif 6511 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 6512 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 6513 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 6514 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 6515 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 6516 /* 6517 * We can allow some features even when not supported by the 6518 * hardware. For example, L1 can specify an MSR bitmap - and we 6519 * can use it to avoid exits to L1 - even when L0 runs L2 6520 * without MSR bitmaps. 6521 */ 6522 msrs->procbased_ctls_high |= 6523 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6524 CPU_BASED_USE_MSR_BITMAPS; 6525 6526 /* We support free control of CR3 access interception. */ 6527 msrs->procbased_ctls_low &= 6528 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 6529 6530 /* 6531 * secondary cpu-based controls. Do not include those that 6532 * depend on CPUID bits, they are added later by 6533 * vmx_vcpu_after_set_cpuid. 6534 */ 6535 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) 6536 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 6537 msrs->secondary_ctls_low, 6538 msrs->secondary_ctls_high); 6539 6540 msrs->secondary_ctls_low = 0; 6541 msrs->secondary_ctls_high &= 6542 SECONDARY_EXEC_DESC | 6543 SECONDARY_EXEC_ENABLE_RDTSCP | 6544 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 6545 SECONDARY_EXEC_WBINVD_EXITING | 6546 SECONDARY_EXEC_APIC_REGISTER_VIRT | 6547 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 6548 SECONDARY_EXEC_RDRAND_EXITING | 6549 SECONDARY_EXEC_ENABLE_INVPCID | 6550 SECONDARY_EXEC_RDSEED_EXITING | 6551 SECONDARY_EXEC_XSAVES | 6552 SECONDARY_EXEC_TSC_SCALING; 6553 6554 /* 6555 * We can emulate "VMCS shadowing," even if the hardware 6556 * doesn't support it. 
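 * Roughly speaking this works because L2's VMREAD/VMWRITE exit to L0
 * anyway: nested_vmx_exit_handled_vmcs_access() consults L1's
 * vmread/vmwrite bitmaps from vmcs12 to decide whether to forward the
 * exit, and accesses that L1 chose not to intercept are emulated by
 * handle_vmread()/handle_vmwrite() against the cached shadow vmcs12,
 * so L1 observes shadow-VMCS semantics without the hardware feature.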
6557 */ 6558 msrs->secondary_ctls_high |= 6559 SECONDARY_EXEC_SHADOW_VMCS; 6560 6561 if (enable_ept) { 6562 /* nested EPT: emulate EPT also to L1 */ 6563 msrs->secondary_ctls_high |= 6564 SECONDARY_EXEC_ENABLE_EPT; 6565 msrs->ept_caps = 6566 VMX_EPT_PAGE_WALK_4_BIT | 6567 VMX_EPT_PAGE_WALK_5_BIT | 6568 VMX_EPTP_WB_BIT | 6569 VMX_EPT_INVEPT_BIT | 6570 VMX_EPT_EXECUTE_ONLY_BIT; 6571 6572 msrs->ept_caps &= ept_caps; 6573 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 6574 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 6575 VMX_EPT_1GB_PAGE_BIT; 6576 if (enable_ept_ad_bits) { 6577 msrs->secondary_ctls_high |= 6578 SECONDARY_EXEC_ENABLE_PML; 6579 msrs->ept_caps |= VMX_EPT_AD_BIT; 6580 } 6581 } 6582 6583 if (cpu_has_vmx_vmfunc()) { 6584 msrs->secondary_ctls_high |= 6585 SECONDARY_EXEC_ENABLE_VMFUNC; 6586 /* 6587 * Advertise EPTP switching unconditionally 6588 * since we emulate it 6589 */ 6590 if (enable_ept) 6591 msrs->vmfunc_controls = 6592 VMX_VMFUNC_EPTP_SWITCHING; 6593 } 6594 6595 /* 6596 * Old versions of KVM use the single-context version without 6597 * checking for support, so declare that it is supported even 6598 * though it is treated as global context. The alternative is 6599 * not failing the single-context invvpid, and it is worse. 6600 */ 6601 if (enable_vpid) { 6602 msrs->secondary_ctls_high |= 6603 SECONDARY_EXEC_ENABLE_VPID; 6604 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 6605 VMX_VPID_EXTENT_SUPPORTED_MASK; 6606 } 6607 6608 if (enable_unrestricted_guest) 6609 msrs->secondary_ctls_high |= 6610 SECONDARY_EXEC_UNRESTRICTED_GUEST; 6611 6612 if (flexpriority_enabled) 6613 msrs->secondary_ctls_high |= 6614 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6615 6616 if (enable_sgx) 6617 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 6618 6619 /* miscellaneous data */ 6620 rdmsr(MSR_IA32_VMX_MISC, 6621 msrs->misc_low, 6622 msrs->misc_high); 6623 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; 6624 msrs->misc_low |= 6625 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 6626 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 6627 VMX_MISC_ACTIVITY_HLT | 6628 VMX_MISC_ACTIVITY_WAIT_SIPI; 6629 msrs->misc_high = 0; 6630 6631 /* 6632 * This MSR reports some information about VMX support. We 6633 * should return information about the VMX we emulate for the 6634 * guest, and the VMCS structure we give it - not about the 6635 * VMX support of the underlying hardware. 6636 */ 6637 msrs->basic = 6638 VMCS12_REVISION | 6639 VMX_BASIC_TRUE_CTLS | 6640 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 6641 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 6642 6643 if (cpu_has_vmx_basic_inout()) 6644 msrs->basic |= VMX_BASIC_INOUT; 6645 6646 /* 6647 * These MSRs specify bits which the guest must keep fixed on 6648 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 6649 * We picked the standard core2 setting. 6650 */ 6651 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 6652 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 6653 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 6654 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 6655 6656 /* These MSRs specify bits which the guest must keep fixed off. 
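 * (A bit that is clear in a FIXED1 MSR must be kept clear in the
 * corresponding CR while in VMX operation.)  Illustrative check
 * combining both halves: a CR value is acceptable roughly when
 *
 *	((val & fixed1) | fixed0) == val
 *
 * so with cr0_fixed0 == VMXON_CR0_ALWAYSON above, a CR0 value with
 * X86_CR0_NE clear would be rejected.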
*/ 6657 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 6658 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 6659 6660 msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); 6661 } 6662 6663 void nested_vmx_hardware_unsetup(void) 6664 { 6665 int i; 6666 6667 if (enable_shadow_vmcs) { 6668 for (i = 0; i < VMX_BITMAP_NR; i++) 6669 free_page((unsigned long)vmx_bitmap[i]); 6670 } 6671 } 6672 6673 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 6674 { 6675 int i; 6676 6677 if (!cpu_has_vmx_shadow_vmcs()) 6678 enable_shadow_vmcs = 0; 6679 if (enable_shadow_vmcs) { 6680 for (i = 0; i < VMX_BITMAP_NR; i++) { 6681 /* 6682 * The vmx_bitmap is not tied to a VM and so should 6683 * not be charged to a memcg. 6684 */ 6685 vmx_bitmap[i] = (unsigned long *) 6686 __get_free_page(GFP_KERNEL); 6687 if (!vmx_bitmap[i]) { 6688 nested_vmx_hardware_unsetup(); 6689 return -ENOMEM; 6690 } 6691 } 6692 6693 init_vmcs_shadow_fields(); 6694 } 6695 6696 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 6697 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 6698 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 6699 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 6700 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 6701 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 6702 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 6703 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; 6704 exit_handlers[EXIT_REASON_VMON] = handle_vmon; 6705 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 6706 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 6707 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 6708 6709 return 0; 6710 } 6711 6712 struct kvm_x86_nested_ops vmx_nested_ops = { 6713 .check_events = vmx_check_nested_events, 6714 .hv_timer_pending = nested_vmx_preemption_timer_pending, 6715 .triple_fault = nested_vmx_triple_fault, 6716 .get_state = vmx_get_nested_state, 6717 .set_state = vmx_set_nested_state, 6718 .get_nested_state_pages = vmx_get_nested_state_pages, 6719 .write_log_dirty = nested_vmx_write_pml_buffer, 6720 .enable_evmcs = nested_enable_evmcs, 6721 .get_evmcs_version = nested_get_evmcs_version, 6722 }; 6723
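
/*
 * Wiring overview (illustrative sketch; the names below are assumptions
 * about vmx.c, which remains the authoritative source): hardware setup
 * in vmx.c is expected to call nested_vmx_setup_ctls_msrs() and then
 * nested_vmx_hardware_setup() with its exit-handler table when the
 * "nested" module parameter is enabled, roughly:
 *
 *	if (nested) {
 *		nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
 *					   vmx_capability.ept);
 *		r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
 *		if (r)
 *			return r;
 *	}
 *
 * while vmx_x86_ops.nested_ops points at the vmx_nested_ops table above
 * so that x86 common code can drive the nested state save/restore and
 * event-injection hooks defined in this file.
 */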