// SPDX-License-Identifier: GPL-2.0

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force a shadow sync because
	 * VM_INSTRUCTION_ERROR is not shadowed
	 */
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't simply reset the guest here.
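	 * For now this requests a triple fault, i.e. resets L1; a physical
	 * CPU signals a VMX abort by writing the abort indicator into the
	 * current VMCS and entering shutdown.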
	 */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, -1ull);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.hv_evmcs)
		return;

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
	vmx->nested.hv_evmcs_vmptr = 0;
	vmx->nested.hv_evmcs = NULL;
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vmx_register_cache_reset(vcpu);
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_clean(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vm_exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else if (fault->error_code & PFERR_RSVD_MASK)
		vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
	else
		vm_exit_reason = EXIT_REASON_EPT_VIOLATION;

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	kvm_init_shadow_ept_mmu(vcpu,
				to_vmx(vcpu)->nested.msrs.ept_caps &
				VMX_EPT_EXECUTE_ONLY_BIT,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
	vcpu->arch.mmu->get_guest_pgd     = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;

	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}

/*
 * KVM wants to inject page-faults which it got to the guest. This function
 * checks whether in a nested guest, we need to inject them to L1 or L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (nr == PF_VECTOR) {
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			*exit_qual = has_payload ?
				     payload : vcpu->arch.cr2;
			return 1;
		}
	} else if (vmcs12->exception_bitmap & (1u << nr)) {
		if (nr == DB_VECTOR) {
			if (!has_payload) {
				payload = vcpu->arch.dr6;
				payload &= ~DR6_BT;
				payload ^= DR6_ACTIVE_LOW;
			}
			*exit_qual = payload;
		} else
			*exit_qual = 0;
		return 1;
	}

	return 0;
}

static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
	    !to_vmx(vcpu)->nested.nested_run_pending) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * Check if MSR is intercepted for L01 MSR bitmap.
 */
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;

	if (msr <= 0x1fff) {
		return !!test_bit(msr, msr_bitmap + 0x800 / f);
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
	}

	return true;
}

/*
 * If an MSR is allowed by L0, we should check whether it is allowed by L1.
 * The corresponding bit will be cleared only if both L0 and L1 allow it.
 */
static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
						 unsigned long *msr_bitmap_nested,
						 u32 msr, int type)
{
	int f = sizeof(unsigned long);

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
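	 * Layout used below: read-low bits at offset 0x000, read-high at
	 * 0x400, write-low at 0x800 and write-high at 0xc00, one bit per MSR
	 * in each 1024-byte quarter (0x2000 MSRs per range).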
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
			/* read-low */
			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
			/* write-low */
			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
			/* read-high */
			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
			/* write-high */
			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

	}
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively disable them below.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
#ifdef CONFIG_X86_64
	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif

	/*
	 * Checking the L0->L1 bitmap is trying to verify two things:
	 *
	 * 1. L0 gave a permission to L1 to actually passthrough the MSR.
	 *    This ensures that we do not accidentally generate an L02 MSR
	 *    bitmap from the L12 MSR bitmap that is too permissive.
	 * 2. That L1 or L2s have actually used the MSR. This avoids
	 *    unnecessary merging of the bitmap if the MSR is unused. This
	 *    works properly because we only update the L01 MSR bitmap lazily.
	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
	 *    updated to reflect this when L1 (or its L2s) actually write to
	 *    the MSR.
	 */
	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_SPEC_CTRL,
					MSR_TYPE_R | MSR_TYPE_W);

	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_PRED_CMD,
					MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct kvm_host_map map;
	struct vmcs12 *shadow;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	shadow = get_shadow_vmcs12(vcpu);

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
		return;

	memcpy(shadow, map.hva, VMCS12_SIZE);
	kvm_vcpu_unmap(vcpu, &map, false);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    (CC(!nested_cpu_has_vid(vmcs12)) ||
	     CC(!nested_exit_intr_ack_set(vcpu)) ||
	     CC((vmcs12->posted_intr_nv & 0xff00)) ||
	     CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE)
	    || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware as
 * much as possible, process all valid entries before failing rather than
 * precheck for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
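	 * Such a value is captured by adding MSR_IA32_TSC to the vmcs02
	 * VM-exit MSR-store area when L1 also stores it on exit; see
	 * prepare_vmx_msr_autostore_list().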
	 */
	if (msr_index == MSR_IA32_TSC) {
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
					     offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_slot;
	bool in_autostore_list;
	int last;

	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
	in_autostore_list = msr_autostore_slot >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here. Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
			 * instead of reading the value from the vmcs02 VMExit
			 * MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore. Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

/*
 * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
 * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
 * enable VPID for L2 (implying it expects a TLB flush on VMX transitions).
 * Here's why.
 *
 * If EPT is enabled by L0 a sync is never needed:
 * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there
 *   cannot be unsync'd SPTEs for either L1 or L2.
 *
 * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter,
 *   as VM-Enter isn't required to invalidate guest-physical mappings
 *   (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush
 *   stale guest-physical mappings for L2 from the TLB.  And as above, L0 isn't
 *   shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit.
 *
 * If EPT is disabled by L0:
 * - if VPID is enabled by L1 (for L2), the situation is similar to when L1
 *   enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't
 *   required to invalidate linear mappings (EPT is disabled so there are
 *   no combined or guest-physical mappings), i.e. L1 can't rely on the
 *   (virtual) CPU to flush stale linear mappings for either L2 or itself (L1).
 *
 * - however if VPID is disabled by L1, then a sync is needed as L1 expects all
 *   linear mappings (EPT is disabled so there are no combined or guest-physical
 *   mappings) to be invalidated on both VM-Enter and VM-Exit.
 *
 * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which
 * additionally checks that L2 has been assigned a VPID (when EPT is disabled).
 * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect
 * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2
 * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has
 * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1
 * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't
 * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush
 * stale TLB entries, at which point L0 will sync L2's MMU.
 */
static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
{
	return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu));
}

/*
 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
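	 * (With EPT enabled for L2, any PAE PDPTEs are taken from the vmcs12
	 * GUEST_PDPTRn fields rather than read from guest memory here.)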
	 */
	if (!nested_ept && is_pae_paging(vcpu) &&
	    (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
		if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
			*entry_failure_code = ENTRY_FAIL_PDPTE;
			return -EINVAL;
		}
	}

	/*
	 * Unconditionally skip the TLB flush on fast CR3 switch, all TLB
	 * flushes are handled by nested_vmx_transition_tlb_flush().  See
	 * nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
	 */
	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3, true,
				!nested_vmx_transition_mmu_sync(vcpu));

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	kvm_init_mmu(vcpu, false);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If VPID is disabled, linear and combined mappings are flushed on
	 * VM-Enter/VM-Exit, and guest-physical mappings are valid only for
	 * their associated EPTP.
	 */
	if (!enable_vpid)
		return;

	/*
	 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
	 * for *all* contexts to be flushed on VM-Enter/VM-Exit.
	 *
	 * If VPID is enabled and used by vmcs12, but L2 does not have a unique
	 * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
	 * a VPID for L2, flush the current context as the effective ASID is
	 * common to both L1 and L2.
	 *
	 * Defer the flush so that it runs after vmcs02.EPTP has been set by
	 * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
	 * redundant flushes further down the nested pipeline.
	 *
	 * If a TLB flush isn't required due to any of the above, and vpid12 is
	 * changing then the new "virtual" VPID (vpid12) will reuse the same
	 * "real" VPID (vpid02), and so needs to be sync'd.  There is no direct
	 * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
	 * all nested vCPUs.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
	} else if (!nested_has_guest_tlb_tag(vcpu)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	} else if (is_vmenter &&
		   vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		vpid_sync_context(nested_get_vpid02(vcpu));
	}
}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmx->nested.msrs.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 supported;
	u32 *lowp, *highp;

	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		lowp = &vmx->nested.msrs.pinbased_ctls_low;
		highp = &vmx->nested.msrs.pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		lowp = &vmx->nested.msrs.procbased_ctls_low;
		highp = &vmx->nested.msrs.procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		lowp = &vmx->nested.msrs.exit_ctls_low;
		highp = &vmx->nested.msrs.exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		lowp = &vmx->nested.msrs.entry_ctls_low;
		highp = &vmx->nested.msrs.entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		lowp = &vmx->nested.msrs.secondary_ctls_low;
		highp = &vmx->nested.msrs.secondary_ctls_high;
		break;
	default:
		BUG();
	}

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0.
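	 * (The high half of a VMX control MSR gives the allowed-1 settings;
	 *  a bit that is clear there means the control must stay 0.)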
	 */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc;

	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				   vmx->nested.msrs.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap;

	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
					   vmx->nested.msrs.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 *msr;

	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		msr = &vmx->nested.msrs.cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		msr = &vmx->nested.msrs.cr4_fixed0;
		break;
	default:
		BUG();
	}

	/*
	 * 1 bits (which indicate bits which "must be 1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*msr = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmx->nested.msrs.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they
 * have been modified by the L1 guest.  Note, "writable" in this context
 * means "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
	}

	/*
	 * Not used?
	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
	 * vmcs12->page_fault_error_code_mask =
	 *		evmcs->page_fault_error_code_mask;
	 * vmcs12->page_fault_error_code_match =
	 *		evmcs->page_fault_error_code_match;
	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
	 */

	/*
	 * Read only fields:
	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
	 * vmcs12->exit_qualification = evmcs->exit_qualification;
	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
	 *
	 * Not present in struct vmcs12:
	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return 0;
}

static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/*
	 * Should not be changed by KVM:
	 *
	 * evmcs->host_es_selector = vmcs12->host_es_selector;
	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
	 * evmcs->host_cr0 = vmcs12->host_cr0;
	 * evmcs->host_cr3 = vmcs12->host_cr3;
	 * evmcs->host_cr4 = vmcs12->host_cr4;
	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
	 * evmcs->host_rip = vmcs12->host_rip;
	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
	 * evmcs->host_fs_base = vmcs12->host_fs_base;
	 * evmcs->host_gs_base = vmcs12->host_gs_base;
	 * evmcs->host_tr_base = vmcs12->host_tr_base;
	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
	 * evmcs->host_rsp = vmcs12->host_rsp;
	 * sync_vmcs02_to_vmcs12() doesn't read these:
	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
	 * evmcs->ept_pointer = vmcs12->ept_pointer;
evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1841 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1842 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1843 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1844 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1845 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1846 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1847 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1848 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1849 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1850 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1851 * evmcs->page_fault_error_code_mask = 1852 * vmcs12->page_fault_error_code_mask; 1853 * evmcs->page_fault_error_code_match = 1854 * vmcs12->page_fault_error_code_match; 1855 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1856 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1857 * evmcs->tsc_offset = vmcs12->tsc_offset; 1858 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1859 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1860 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1861 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1862 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1863 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1864 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1865 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1866 * 1867 * Not present in struct vmcs12: 1868 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1869 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1870 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1871 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1872 */ 1873 1874 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1875 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1876 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1877 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1878 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1879 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1880 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1881 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1882 1883 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1884 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1885 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1886 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1887 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1888 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1889 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1890 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1891 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1892 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1893 1894 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1895 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1896 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1897 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1898 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1899 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1900 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1901 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1902 1903 evmcs->guest_es_base = vmcs12->guest_es_base; 1904 evmcs->guest_cs_base = vmcs12->guest_cs_base; 
1905 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1906 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1907 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1908 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1909 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1910 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1911 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1912 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1913 1914 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1915 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1916 1917 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1918 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1919 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1920 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1921 1922 evmcs->guest_pending_dbg_exceptions = 1923 vmcs12->guest_pending_dbg_exceptions; 1924 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1925 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1926 1927 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1928 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1929 1930 evmcs->guest_cr0 = vmcs12->guest_cr0; 1931 evmcs->guest_cr3 = vmcs12->guest_cr3; 1932 evmcs->guest_cr4 = vmcs12->guest_cr4; 1933 evmcs->guest_dr7 = vmcs12->guest_dr7; 1934 1935 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1936 1937 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1938 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1939 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1940 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1941 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1942 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1943 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1944 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1945 1946 evmcs->exit_qualification = vmcs12->exit_qualification; 1947 1948 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1949 evmcs->guest_rsp = vmcs12->guest_rsp; 1950 evmcs->guest_rflags = vmcs12->guest_rflags; 1951 1952 evmcs->guest_interruptibility_info = 1953 vmcs12->guest_interruptibility_info; 1954 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1955 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1956 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1957 evmcs->vm_entry_exception_error_code = 1958 vmcs12->vm_entry_exception_error_code; 1959 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 1960 1961 evmcs->guest_rip = vmcs12->guest_rip; 1962 1963 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 1964 1965 return 0; 1966 } 1967 1968 /* 1969 * This is an equivalent of the nested hypervisor executing the vmptrld 1970 * instruction. 
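 *
 * Unlike a real VMPTRLD there is no instruction operand here: the eVMCS
 * GPA is obtained via nested_enlightened_vmentry() and the backing page
 * is (re)mapped below whenever that GPA changes.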
1971 */ 1972 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( 1973 struct kvm_vcpu *vcpu, bool from_launch) 1974 { 1975 struct vcpu_vmx *vmx = to_vmx(vcpu); 1976 bool evmcs_gpa_changed = false; 1977 u64 evmcs_gpa; 1978 1979 if (likely(!vmx->nested.enlightened_vmcs_enabled)) 1980 return EVMPTRLD_DISABLED; 1981 1982 if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) 1983 return EVMPTRLD_DISABLED; 1984 1985 if (unlikely(!vmx->nested.hv_evmcs || 1986 evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { 1987 if (!vmx->nested.hv_evmcs) 1988 vmx->nested.current_vmptr = -1ull; 1989 1990 nested_release_evmcs(vcpu); 1991 1992 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), 1993 &vmx->nested.hv_evmcs_map)) 1994 return EVMPTRLD_ERROR; 1995 1996 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; 1997 1998 /* 1999 * Currently, KVM only supports eVMCS version 1 2000 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this 2001 * value to first u32 field of eVMCS which should specify eVMCS 2002 * VersionNumber. 2003 * 2004 * Guest should be aware of supported eVMCS versions by host by 2005 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is 2006 * expected to set this CPUID leaf according to the value 2007 * returned in vmcs_version from nested_enable_evmcs(). 2008 * 2009 * However, it turns out that Microsoft Hyper-V fails to comply 2010 * to their own invented interface: When Hyper-V use eVMCS, it 2011 * just sets first u32 field of eVMCS to revision_id specified 2012 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number 2013 * which is one of the supported versions specified in 2014 * CPUID.0x4000000A.EAX[0:15]. 2015 * 2016 * To overcome Hyper-V bug, we accept here either a supported 2017 * eVMCS version or VMCS12 revision_id as valid values for first 2018 * u32 field of eVMCS. 2019 */ 2020 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && 2021 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { 2022 nested_release_evmcs(vcpu); 2023 return EVMPTRLD_VMFAIL; 2024 } 2025 2026 vmx->nested.dirty_vmcs12 = true; 2027 vmx->nested.hv_evmcs_vmptr = evmcs_gpa; 2028 2029 evmcs_gpa_changed = true; 2030 /* 2031 * Unlike normal vmcs12, enlightened vmcs12 is not fully 2032 * reloaded from guest's memory (read only fields, fields not 2033 * present in struct hv_enlightened_vmcs, ...). Make sure there 2034 * are no leftovers. 2035 */ 2036 if (from_launch) { 2037 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2038 memset(vmcs12, 0, sizeof(*vmcs12)); 2039 vmcs12->hdr.revision_id = VMCS12_REVISION; 2040 } 2041 2042 } 2043 2044 /* 2045 * Clean fields data can't be used on VMLAUNCH and when we switch 2046 * between different L2 guests as KVM keeps a single VMCS12 per L1. 
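 *
 * Clearing HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL below forces the next
 * copy_enlightened_to_vmcs12() to re-read every field group from the
 * eVMCS instead of skipping groups the guest marked clean.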
2047 */ 2048 if (from_launch || evmcs_gpa_changed) 2049 vmx->nested.hv_evmcs->hv_clean_fields &= 2050 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2051 2052 return EVMPTRLD_SUCCEEDED; 2053 } 2054 2055 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2056 { 2057 struct vcpu_vmx *vmx = to_vmx(vcpu); 2058 2059 if (vmx->nested.hv_evmcs) { 2060 copy_vmcs12_to_enlightened(vmx); 2061 /* All fields are clean */ 2062 vmx->nested.hv_evmcs->hv_clean_fields |= 2063 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2064 } else { 2065 copy_vmcs12_to_shadow(vmx); 2066 } 2067 2068 vmx->nested.need_vmcs12_to_shadow_sync = false; 2069 } 2070 2071 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2072 { 2073 struct vcpu_vmx *vmx = 2074 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2075 2076 vmx->nested.preemption_timer_expired = true; 2077 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2078 kvm_vcpu_kick(&vmx->vcpu); 2079 2080 return HRTIMER_NORESTART; 2081 } 2082 2083 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2084 { 2085 struct vcpu_vmx *vmx = to_vmx(vcpu); 2086 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2087 2088 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2089 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2090 2091 if (!vmx->nested.has_preemption_timer_deadline) { 2092 vmx->nested.preemption_timer_deadline = 2093 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2094 vmx->nested.has_preemption_timer_deadline = true; 2095 } 2096 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2097 } 2098 2099 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2100 u64 preemption_timeout) 2101 { 2102 struct vcpu_vmx *vmx = to_vmx(vcpu); 2103 2104 /* 2105 * A timer value of zero is architecturally guaranteed to cause 2106 * a VMExit prior to executing any instructions in the guest. 2107 */ 2108 if (preemption_timeout == 0) { 2109 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2110 return; 2111 } 2112 2113 if (vcpu->arch.virtual_tsc_khz == 0) 2114 return; 2115 2116 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2117 preemption_timeout *= 1000000; 2118 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2119 hrtimer_start(&vmx->nested.preemption_timer, 2120 ktime_add_ns(ktime_get(), preemption_timeout), 2121 HRTIMER_MODE_ABS_PINNED); 2122 } 2123 2124 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2125 { 2126 if (vmx->nested.nested_run_pending && 2127 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2128 return vmcs12->guest_ia32_efer; 2129 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2130 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2131 else 2132 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2133 } 2134 2135 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2136 { 2137 /* 2138 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2139 * according to L0's settings (vmcs12 is irrelevant here). Host 2140 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2141 * will be set as needed prior to VMLAUNCH/VMRESUME. 2142 */ 2143 if (vmx->nested.vmcs02_initialized) 2144 return; 2145 vmx->nested.vmcs02_initialized = true; 2146 2147 /* 2148 * We don't care what the EPTP value is we just need to guarantee 2149 * it's valid so we don't get a false positive when doing early 2150 * consistency checks. 
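 *
 * The dummy value written below is a 4-level EPTP rooted at physical
 * address zero; it merely has to pass hardware's EPTP validity checks
 * during the early VMLAUNCH performed by nested_vmx_check_vmentry_hw().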
2151 */ 2152 if (enable_ept && nested_early_check) 2153 vmcs_write64(EPT_POINTER, 2154 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2155 2156 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2157 if (cpu_has_vmx_vmfunc()) 2158 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2159 2160 if (cpu_has_vmx_posted_intr()) 2161 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2162 2163 if (cpu_has_vmx_msr_bitmap()) 2164 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2165 2166 /* 2167 * PML is emulated for L2, but never enabled in hardware as the MMU 2168 * handles A/D emulation. Disabling PML for L2 also avoids having to 2169 * deal with filtering out L2 GPAs from the buffer. 2170 */ 2171 if (enable_pml) { 2172 vmcs_write64(PML_ADDRESS, 0); 2173 vmcs_write16(GUEST_PML_INDEX, -1); 2174 } 2175 2176 if (cpu_has_vmx_encls_vmexit()) 2177 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); 2178 2179 /* 2180 * Set the MSR load/store lists to match L0's settings. Only the 2181 * addresses are constant (for vmcs02), the counts can change based 2182 * on L2's behavior, e.g. switching to/from long mode. 2183 */ 2184 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2185 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2186 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2187 2188 vmx_set_constant_host_state(vmx); 2189 } 2190 2191 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2192 struct vmcs12 *vmcs12) 2193 { 2194 prepare_vmcs02_constant_state(vmx); 2195 2196 vmcs_write64(VMCS_LINK_POINTER, -1ull); 2197 2198 if (enable_vpid) { 2199 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2200 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2201 else 2202 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2203 } 2204 } 2205 2206 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2207 { 2208 u32 exec_control; 2209 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2210 2211 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) 2212 prepare_vmcs02_early_rare(vmx, vmcs12); 2213 2214 /* 2215 * PIN CONTROLS 2216 */ 2217 exec_control = vmx_pin_based_exec_ctrl(vmx); 2218 exec_control |= (vmcs12->pin_based_vm_exec_control & 2219 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2220 2221 /* Posted interrupts setting is only taken from vmcs12. */ 2222 if (nested_cpu_has_posted_intr(vmcs12)) { 2223 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2224 vmx->nested.pi_pending = false; 2225 } else { 2226 exec_control &= ~PIN_BASED_POSTED_INTR; 2227 } 2228 pin_controls_set(vmx, exec_control); 2229 2230 /* 2231 * EXEC CONTROLS 2232 */ 2233 exec_control = vmx_exec_control(vmx); /* L0's desires */ 2234 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2235 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2236 exec_control &= ~CPU_BASED_TPR_SHADOW; 2237 exec_control |= vmcs12->cpu_based_vm_exec_control; 2238 2239 vmx->nested.l1_tpr_threshold = -1; 2240 if (exec_control & CPU_BASED_TPR_SHADOW) 2241 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2242 #ifdef CONFIG_X86_64 2243 else 2244 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2245 CPU_BASED_CR8_STORE_EXITING; 2246 #endif 2247 2248 /* 2249 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2250 * for I/O port accesses. 
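 * Rather than merging L1's I/O bitmaps into vmcs02, unconditional I/O
 * exiting is forced below and CPU_BASED_USE_IO_BITMAPS is cleared.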
2251 */ 2252 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2253 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2254 2255 /* 2256 * This bit will be computed in nested_get_vmcs12_pages, because 2257 * we do not have access to L1's MSR bitmap yet. For now, keep 2258 * the same bit as before, hoping to avoid multiple VMWRITEs that 2259 * only set/clear this bit. 2260 */ 2261 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2262 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2263 2264 exec_controls_set(vmx, exec_control); 2265 2266 /* 2267 * SECONDARY EXEC CONTROLS 2268 */ 2269 if (cpu_has_secondary_exec_ctrls()) { 2270 exec_control = vmx->secondary_exec_control; 2271 2272 /* Take the following fields only from vmcs12 */ 2273 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2274 SECONDARY_EXEC_ENABLE_INVPCID | 2275 SECONDARY_EXEC_ENABLE_RDTSCP | 2276 SECONDARY_EXEC_XSAVES | 2277 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2278 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2279 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2280 SECONDARY_EXEC_ENABLE_VMFUNC); 2281 if (nested_cpu_has(vmcs12, 2282 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2283 exec_control |= vmcs12->secondary_vm_exec_control; 2284 2285 /* PML is emulated and never enabled in hardware for L2. */ 2286 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2287 2288 /* VMCS shadowing for L2 is emulated for now */ 2289 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2290 2291 /* 2292 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2293 * will not have to rewrite the controls just for this bit. 2294 */ 2295 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2296 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2297 exec_control |= SECONDARY_EXEC_DESC; 2298 2299 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2300 vmcs_write16(GUEST_INTR_STATUS, 2301 vmcs12->guest_intr_status); 2302 2303 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2304 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2305 2306 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2307 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2308 2309 secondary_exec_controls_set(vmx, exec_control); 2310 } 2311 2312 /* 2313 * ENTRY CONTROLS 2314 * 2315 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2316 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2317 * on the related bits (if supported by the CPU) in the hope that 2318 * we can avoid VMWrites during vmx_set_efer(). 2319 */ 2320 exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & 2321 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; 2322 if (cpu_has_load_ia32_efer()) { 2323 if (guest_efer & EFER_LMA) 2324 exec_control |= VM_ENTRY_IA32E_MODE; 2325 if (guest_efer != host_efer) 2326 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2327 } 2328 vm_entry_controls_set(vmx, exec_control); 2329 2330 /* 2331 * EXIT CONTROLS 2332 * 2333 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2334 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2335 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
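 * As with the entry controls above, the load-EFER bit is only set
 * speculatively here, when the CPU supports it and guest_efer differs
 * from host_efer.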
2336 */ 2337 exec_control = vmx_vmexit_ctrl(); 2338 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2339 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2340 vm_exit_controls_set(vmx, exec_control); 2341 2342 /* 2343 * Interrupt/Exception Fields 2344 */ 2345 if (vmx->nested.nested_run_pending) { 2346 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2347 vmcs12->vm_entry_intr_info_field); 2348 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2349 vmcs12->vm_entry_exception_error_code); 2350 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2351 vmcs12->vm_entry_instruction_len); 2352 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2353 vmcs12->guest_interruptibility_info); 2354 vmx->loaded_vmcs->nmi_known_unmasked = 2355 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2356 } else { 2357 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2358 } 2359 } 2360 2361 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2362 { 2363 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2364 2365 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2366 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2367 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2368 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2369 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2370 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2371 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2372 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2373 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2374 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2375 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2376 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2377 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2378 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2379 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2380 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2381 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2382 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2383 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2384 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2385 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2386 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2387 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2388 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2389 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2390 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2391 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2392 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2393 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2394 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2395 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2396 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2397 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2398 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2399 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2400 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2401 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2402 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2403 2404 vmx->segment_cache.bitmask = 0; 2405 } 2406 2407 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2408 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2409 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2410 
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2411 vmcs12->guest_pending_dbg_exceptions); 2412 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2413 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2414 2415 /* 2416 * L1 may access the L2's PDPTR, so save them to construct 2417 * vmcs12 2418 */ 2419 if (enable_ept) { 2420 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2421 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2422 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2423 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2424 } 2425 2426 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2427 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2428 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2429 } 2430 2431 if (nested_cpu_has_xsaves(vmcs12)) 2432 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2433 2434 /* 2435 * Whether page-faults are trapped is determined by a combination of 2436 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2437 * doesn't care about page faults then we should set all of these to 2438 * L1's desires. However, if L0 does care about (some) page faults, it 2439 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2440 * simply ask to exit on each and every L2 page fault. This is done by 2441 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2442 * Note that below we don't need special code to set EB.PF beyond the 2443 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2444 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2445 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2446 */ 2447 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2448 /* 2449 * TODO: if both L0 and L1 need the same MASK and MATCH, 2450 * go ahead and use it? 2451 */ 2452 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2453 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2454 } else { 2455 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2456 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2457 } 2458 2459 if (cpu_has_vmx_apicv()) { 2460 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2461 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2462 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2463 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2464 } 2465 2466 /* 2467 * Make sure the msr_autostore list is up to date before we set the 2468 * count in the vmcs02. 2469 */ 2470 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2471 2472 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2473 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2474 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2475 2476 set_cr4_guest_host_mask(vmx); 2477 } 2478 2479 /* 2480 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2481 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2482 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2483 * guest in a way that will both be appropriate to L1's requests, and our 2484 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2485 * function also has additional necessary side-effects, like setting various 2486 * vcpu->arch fields. 2487 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2488 * is assigned to entry_failure_code on failure. 
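 * (In the code below the failure value is actually -EINVAL rather than
 * literally 1; callers only check for a non-zero return.)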
2489 */ 2490 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2491 enum vm_entry_failure_code *entry_failure_code) 2492 { 2493 struct vcpu_vmx *vmx = to_vmx(vcpu); 2494 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2495 bool load_guest_pdptrs_vmcs12 = false; 2496 2497 if (vmx->nested.dirty_vmcs12 || hv_evmcs) { 2498 prepare_vmcs02_rare(vmx, vmcs12); 2499 vmx->nested.dirty_vmcs12 = false; 2500 2501 load_guest_pdptrs_vmcs12 = !hv_evmcs || 2502 !(hv_evmcs->hv_clean_fields & 2503 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2504 } 2505 2506 if (vmx->nested.nested_run_pending && 2507 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2508 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2509 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2510 } else { 2511 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2512 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 2513 } 2514 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2515 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2516 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); 2517 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2518 2519 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2520 * bitwise-or of what L1 wants to trap for L2, and what we want to 2521 * trap. Note that CR0.TS also needs updating - we do this later. 2522 */ 2523 vmx_update_exception_bitmap(vcpu); 2524 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2525 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2526 2527 if (vmx->nested.nested_run_pending && 2528 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2529 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2530 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2531 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2532 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2533 } 2534 2535 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2536 2537 if (kvm_has_tsc_control) 2538 decache_tsc_multiplier(vmx); 2539 2540 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2541 2542 if (nested_cpu_has_ept(vmcs12)) 2543 nested_ept_init_mmu_context(vcpu); 2544 2545 /* 2546 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those 2547 * bits which we consider mandatory enabled. 2548 * The CR0_READ_SHADOW is what L2 should have expected to read given 2549 * the specifications by L1; It's not enough to take 2550 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we 2551 * have more bits than L1 expected. 2552 */ 2553 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2554 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2555 2556 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2557 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2558 2559 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2560 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2561 vmx_set_efer(vcpu, vcpu->arch.efer); 2562 2563 /* 2564 * Guest state is invalid and unrestricted guest is disabled, 2565 * which means L1 attempted VMEntry to L2 with invalid state. 2566 * Fail the VMEntry. 2567 */ 2568 if (CC(!vmx_guest_state_valid(vcpu))) { 2569 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2570 return -EINVAL; 2571 } 2572 2573 /* Shadow page tables on either EPT or shadow page tables. */ 2574 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2575 entry_failure_code)) 2576 return -EINVAL; 2577 2578 /* 2579 * Immediately write vmcs02.GUEST_CR3. 
It will be propagated to vmcs12 2580 * on nested VM-Exit, which can occur without actually running L2 and 2581 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with 2582 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the 2583 * transition to HLT instead of running L2. 2584 */ 2585 if (enable_ept) 2586 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2587 2588 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2589 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2590 is_pae_paging(vcpu)) { 2591 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2592 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2593 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2594 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2595 } 2596 2597 if (!enable_ept) 2598 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2599 2600 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2601 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2602 vmcs12->guest_ia32_perf_global_ctrl))) 2603 return -EINVAL; 2604 2605 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2606 kvm_rip_write(vcpu, vmcs12->guest_rip); 2607 return 0; 2608 } 2609 2610 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2611 { 2612 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2613 nested_cpu_has_virtual_nmis(vmcs12))) 2614 return -EINVAL; 2615 2616 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2617 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2618 return -EINVAL; 2619 2620 return 0; 2621 } 2622 2623 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2624 { 2625 struct vcpu_vmx *vmx = to_vmx(vcpu); 2626 2627 /* Check for memory type validity */ 2628 switch (new_eptp & VMX_EPTP_MT_MASK) { 2629 case VMX_EPTP_MT_UC: 2630 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2631 return false; 2632 break; 2633 case VMX_EPTP_MT_WB: 2634 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2635 return false; 2636 break; 2637 default: 2638 return false; 2639 } 2640 2641 /* Page-walk levels validity. 
*/ 2642 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2643 case VMX_EPTP_PWL_5: 2644 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2645 return false; 2646 break; 2647 case VMX_EPTP_PWL_4: 2648 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2649 return false; 2650 break; 2651 default: 2652 return false; 2653 } 2654 2655 /* Reserved bits should not be set */ 2656 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2657 return false; 2658 2659 /* AD, if set, should be supported */ 2660 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2661 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2662 return false; 2663 } 2664 2665 return true; 2666 } 2667 2668 /* 2669 * Checks related to VM-Execution Control Fields 2670 */ 2671 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2672 struct vmcs12 *vmcs12) 2673 { 2674 struct vcpu_vmx *vmx = to_vmx(vcpu); 2675 2676 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2677 vmx->nested.msrs.pinbased_ctls_low, 2678 vmx->nested.msrs.pinbased_ctls_high)) || 2679 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2680 vmx->nested.msrs.procbased_ctls_low, 2681 vmx->nested.msrs.procbased_ctls_high))) 2682 return -EINVAL; 2683 2684 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2685 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2686 vmx->nested.msrs.secondary_ctls_low, 2687 vmx->nested.msrs.secondary_ctls_high))) 2688 return -EINVAL; 2689 2690 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2691 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2692 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2693 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2694 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2695 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2696 nested_vmx_check_nmi_controls(vmcs12) || 2697 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2698 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2699 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2700 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2701 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2702 return -EINVAL; 2703 2704 if (!nested_cpu_has_preemption_timer(vmcs12) && 2705 nested_cpu_has_save_preemption_timer(vmcs12)) 2706 return -EINVAL; 2707 2708 if (nested_cpu_has_ept(vmcs12) && 2709 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2710 return -EINVAL; 2711 2712 if (nested_cpu_has_vmfunc(vmcs12)) { 2713 if (CC(vmcs12->vm_function_control & 2714 ~vmx->nested.msrs.vmfunc_controls)) 2715 return -EINVAL; 2716 2717 if (nested_cpu_has_eptp_switching(vmcs12)) { 2718 if (CC(!nested_cpu_has_ept(vmcs12)) || 2719 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2720 return -EINVAL; 2721 } 2722 } 2723 2724 return 0; 2725 } 2726 2727 /* 2728 * Checks related to VM-Exit Control Fields 2729 */ 2730 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2731 struct vmcs12 *vmcs12) 2732 { 2733 struct vcpu_vmx *vmx = to_vmx(vcpu); 2734 2735 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2736 vmx->nested.msrs.exit_ctls_low, 2737 vmx->nested.msrs.exit_ctls_high)) || 2738 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2739 return -EINVAL; 2740 2741 return 0; 2742 } 2743 2744 /* 2745 * Checks related to VM-Entry Control Fields 2746 */ 2747 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2748 struct vmcs12 *vmcs12) 2749 { 2750 struct 
vcpu_vmx *vmx = to_vmx(vcpu); 2751 2752 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2753 vmx->nested.msrs.entry_ctls_low, 2754 vmx->nested.msrs.entry_ctls_high))) 2755 return -EINVAL; 2756 2757 /* 2758 * From the Intel SDM, volume 3: 2759 * Fields relevant to VM-entry event injection must be set properly. 2760 * These fields are the VM-entry interruption-information field, the 2761 * VM-entry exception error code, and the VM-entry instruction length. 2762 */ 2763 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2764 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2765 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2766 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2767 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2768 bool should_have_error_code; 2769 bool urg = nested_cpu_has2(vmcs12, 2770 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2771 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2772 2773 /* VM-entry interruption-info field: interruption type */ 2774 if (CC(intr_type == INTR_TYPE_RESERVED) || 2775 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2776 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2777 return -EINVAL; 2778 2779 /* VM-entry interruption-info field: vector */ 2780 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2781 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2782 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2783 return -EINVAL; 2784 2785 /* VM-entry interruption-info field: deliver error code */ 2786 should_have_error_code = 2787 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2788 x86_exception_has_error_code(vector); 2789 if (CC(has_error_code != should_have_error_code)) 2790 return -EINVAL; 2791 2792 /* VM-entry exception error code */ 2793 if (CC(has_error_code && 2794 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2795 return -EINVAL; 2796 2797 /* VM-entry interruption-info field: reserved bits */ 2798 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2799 return -EINVAL; 2800 2801 /* VM-entry instruction length */ 2802 switch (intr_type) { 2803 case INTR_TYPE_SOFT_EXCEPTION: 2804 case INTR_TYPE_SOFT_INTR: 2805 case INTR_TYPE_PRIV_SW_EXCEPTION: 2806 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2807 CC(vmcs12->vm_entry_instruction_len == 0 && 2808 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2809 return -EINVAL; 2810 } 2811 } 2812 2813 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2814 return -EINVAL; 2815 2816 return 0; 2817 } 2818 2819 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2820 struct vmcs12 *vmcs12) 2821 { 2822 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2823 nested_check_vm_exit_controls(vcpu, vmcs12) || 2824 nested_check_vm_entry_controls(vcpu, vmcs12)) 2825 return -EINVAL; 2826 2827 if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled) 2828 return nested_evmcs_check_controls(vmcs12); 2829 2830 return 0; 2831 } 2832 2833 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2834 struct vmcs12 *vmcs12) 2835 { 2836 bool ia32e; 2837 2838 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2839 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2840 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) 2841 return -EINVAL; 2842 2843 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2844 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2845 return -EINVAL; 2846 2847 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2848 
CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2849 return -EINVAL; 2850 2851 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2852 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2853 vmcs12->host_ia32_perf_global_ctrl))) 2854 return -EINVAL; 2855 2856 #ifdef CONFIG_X86_64 2857 ia32e = !!(vcpu->arch.efer & EFER_LMA); 2858 #else 2859 ia32e = false; 2860 #endif 2861 2862 if (ia32e) { 2863 if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || 2864 CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2865 return -EINVAL; 2866 } else { 2867 if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || 2868 CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2869 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2870 CC((vmcs12->host_rip) >> 32)) 2871 return -EINVAL; 2872 } 2873 2874 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2875 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2876 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2877 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2878 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2879 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2880 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2881 CC(vmcs12->host_cs_selector == 0) || 2882 CC(vmcs12->host_tr_selector == 0) || 2883 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2884 return -EINVAL; 2885 2886 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2887 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2888 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2889 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2890 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2891 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2892 return -EINVAL; 2893 2894 /* 2895 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2896 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2897 * the values of the LMA and LME bits in the field must each be that of 2898 * the host address-space size VM-exit control. 
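 *
 * Here ia32e reflects L1's current EFER.LMA (computed above), so the
 * EFER.LMA/LME checks below effectively require vmcs12's host EFER to
 * match the mode L1 itself is running in.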
2899 */ 2900 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2901 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2902 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2903 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2904 return -EINVAL; 2905 } 2906 2907 return 0; 2908 } 2909 2910 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2911 struct vmcs12 *vmcs12) 2912 { 2913 int r = 0; 2914 struct vmcs12 *shadow; 2915 struct kvm_host_map map; 2916 2917 if (vmcs12->vmcs_link_pointer == -1ull) 2918 return 0; 2919 2920 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2921 return -EINVAL; 2922 2923 if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) 2924 return -EINVAL; 2925 2926 shadow = map.hva; 2927 2928 if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || 2929 CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2930 r = -EINVAL; 2931 2932 kvm_vcpu_unmap(vcpu, &map, false); 2933 return r; 2934 } 2935 2936 /* 2937 * Checks related to Guest Non-register State 2938 */ 2939 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2940 { 2941 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2942 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 2943 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 2944 return -EINVAL; 2945 2946 return 0; 2947 } 2948 2949 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 2950 struct vmcs12 *vmcs12, 2951 enum vm_entry_failure_code *entry_failure_code) 2952 { 2953 bool ia32e; 2954 2955 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2956 2957 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 2958 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 2959 return -EINVAL; 2960 2961 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 2962 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 2963 return -EINVAL; 2964 2965 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 2966 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 2967 return -EINVAL; 2968 2969 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 2970 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 2971 return -EINVAL; 2972 } 2973 2974 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2975 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2976 vmcs12->guest_ia32_perf_global_ctrl))) 2977 return -EINVAL; 2978 2979 /* 2980 * If the load IA32_EFER VM-entry control is 1, the following checks 2981 * are performed on the field for the IA32_EFER MSR: 2982 * - Bits reserved in the IA32_EFER MSR must be 0. 2983 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 2984 * the IA-32e mode guest VM-exit control. It must also be identical 2985 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 2986 * CR0.PG) is 1. 
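 *
 * The EFER checks below are skipped unless a VMLAUNCH/VMRESUME is
 * actually pending (nested_run_pending).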
2987 */ 2988 if (to_vmx(vcpu)->nested.nested_run_pending && 2989 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 2990 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 2991 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 2992 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 2993 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 2994 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 2995 return -EINVAL; 2996 } 2997 2998 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 2999 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3000 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3001 return -EINVAL; 3002 3003 if (nested_check_guest_non_reg_state(vmcs12)) 3004 return -EINVAL; 3005 3006 return 0; 3007 } 3008 3009 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3010 { 3011 struct vcpu_vmx *vmx = to_vmx(vcpu); 3012 unsigned long cr3, cr4; 3013 bool vm_fail; 3014 3015 if (!nested_early_check) 3016 return 0; 3017 3018 if (vmx->msr_autoload.host.nr) 3019 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3020 if (vmx->msr_autoload.guest.nr) 3021 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3022 3023 preempt_disable(); 3024 3025 vmx_prepare_switch_to_guest(vcpu); 3026 3027 /* 3028 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3029 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3030 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3031 * there is no need to preserve other bits or save/restore the field. 3032 */ 3033 vmcs_writel(GUEST_RFLAGS, 0); 3034 3035 cr3 = __get_current_cr3_fast(); 3036 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3037 vmcs_writel(HOST_CR3, cr3); 3038 vmx->loaded_vmcs->host_state.cr3 = cr3; 3039 } 3040 3041 cr4 = cr4_read_shadow(); 3042 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3043 vmcs_writel(HOST_CR4, cr4); 3044 vmx->loaded_vmcs->host_state.cr4 = cr4; 3045 } 3046 3047 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3048 vmx->loaded_vmcs->launched); 3049 3050 if (vmx->msr_autoload.host.nr) 3051 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3052 if (vmx->msr_autoload.guest.nr) 3053 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3054 3055 if (vm_fail) { 3056 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3057 3058 preempt_enable(); 3059 3060 trace_kvm_nested_vmenter_failed( 3061 "early hardware check VM-instruction error: ", error); 3062 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3063 return 1; 3064 } 3065 3066 /* 3067 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3068 */ 3069 if (hw_breakpoint_active()) 3070 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3071 local_irq_enable(); 3072 preempt_enable(); 3073 3074 /* 3075 * A non-failing VMEntry means we somehow entered guest mode with 3076 * an illegal RIP, and that's just the tip of the iceberg. There 3077 * is no telling what memory has been modified or what state has 3078 * been exposed to unknown code. Hitting this all but guarantees 3079 * a (very critical) hardware issue. 3080 */ 3081 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3082 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3083 3084 return 0; 3085 } 3086 3087 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3088 { 3089 struct vcpu_vmx *vmx = to_vmx(vcpu); 3090 3091 /* 3092 * hv_evmcs may end up being not mapped after migration (when 3093 * L2 was running), map it here to make sure vmcs12 changes are 3094 * properly reflected. 
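 *
 * A mapping failure here is reported to userspace as
 * KVM_EXIT_INTERNAL_ERROR; this path is not emulating any VMX
 * instruction, so there is nothing to fail architecturally.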
3095 */ 3096 if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) { 3097 enum nested_evmptrld_status evmptrld_status = 3098 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3099 3100 if (evmptrld_status == EVMPTRLD_VMFAIL || 3101 evmptrld_status == EVMPTRLD_ERROR) { 3102 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3103 __func__); 3104 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3105 vcpu->run->internal.suberror = 3106 KVM_INTERNAL_ERROR_EMULATION; 3107 vcpu->run->internal.ndata = 0; 3108 return false; 3109 } 3110 } 3111 3112 return true; 3113 } 3114 3115 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3116 { 3117 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3118 struct vcpu_vmx *vmx = to_vmx(vcpu); 3119 struct kvm_host_map *map; 3120 struct page *page; 3121 u64 hpa; 3122 3123 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3124 /* 3125 * Translate L1 physical address to host physical 3126 * address for vmcs02. Keep the page pinned, so this 3127 * physical address remains valid. We keep a reference 3128 * to it so we can release it later. 3129 */ 3130 if (vmx->nested.apic_access_page) { /* shouldn't happen */ 3131 kvm_release_page_clean(vmx->nested.apic_access_page); 3132 vmx->nested.apic_access_page = NULL; 3133 } 3134 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); 3135 if (!is_error_page(page)) { 3136 vmx->nested.apic_access_page = page; 3137 hpa = page_to_phys(vmx->nested.apic_access_page); 3138 vmcs_write64(APIC_ACCESS_ADDR, hpa); 3139 } else { 3140 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", 3141 __func__); 3142 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3143 vcpu->run->internal.suberror = 3144 KVM_INTERNAL_ERROR_EMULATION; 3145 vcpu->run->internal.ndata = 0; 3146 return false; 3147 } 3148 } 3149 3150 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3151 map = &vmx->nested.virtual_apic_map; 3152 3153 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3154 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3155 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3156 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3157 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3158 /* 3159 * The processor will never use the TPR shadow, simply 3160 * clear the bit from the execution control. Such a 3161 * configuration is useless, but it happens in tests. 3162 * For any other configuration, failing the vm entry is 3163 * _not_ what the processor does but it's basically the 3164 * only possibility we have. 3165 */ 3166 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3167 } else { 3168 /* 3169 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3170 * force VM-Entry to fail. 
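 * (-1ull is neither page aligned nor a legal guest physical address,
 * so the consistency checks on vmcs02 are guaranteed to reject it.)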
3171 */ 3172 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); 3173 } 3174 } 3175 3176 if (nested_cpu_has_posted_intr(vmcs12)) { 3177 map = &vmx->nested.pi_desc_map; 3178 3179 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3180 vmx->nested.pi_desc = 3181 (struct pi_desc *)(((void *)map->hva) + 3182 offset_in_page(vmcs12->posted_intr_desc_addr)); 3183 vmcs_write64(POSTED_INTR_DESC_ADDR, 3184 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3185 } 3186 } 3187 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3188 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3189 else 3190 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3191 3192 return true; 3193 } 3194 3195 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3196 { 3197 if (!nested_get_evmcs_page(vcpu)) 3198 return false; 3199 3200 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3201 return false; 3202 3203 return true; 3204 } 3205 3206 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3207 { 3208 struct vmcs12 *vmcs12; 3209 struct vcpu_vmx *vmx = to_vmx(vcpu); 3210 gpa_t dst; 3211 3212 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3213 return 0; 3214 3215 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3216 return 1; 3217 3218 /* 3219 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3220 * set is already checked as part of A/D emulation. 3221 */ 3222 vmcs12 = get_vmcs12(vcpu); 3223 if (!nested_cpu_has_pml(vmcs12)) 3224 return 0; 3225 3226 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3227 vmx->nested.pml_full = true; 3228 return 1; 3229 } 3230 3231 gpa &= ~0xFFFull; 3232 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3233 3234 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3235 offset_in_page(dst), sizeof(gpa))) 3236 return 0; 3237 3238 vmcs12->guest_pml_index--; 3239 3240 return 0; 3241 } 3242 3243 /* 3244 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3245 * for running VMX instructions (except VMXON, whose prerequisites are 3246 * slightly different). It also specifies what exception to inject otherwise. 3247 * Note that many of these exceptions have priority over VM exits, so they 3248 * don't have to be checked again here. 3249 */ 3250 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3251 { 3252 if (!to_vmx(vcpu)->nested.vmxon) { 3253 kvm_queue_exception(vcpu, UD_VECTOR); 3254 return 0; 3255 } 3256 3257 if (vmx_get_cpl(vcpu)) { 3258 kvm_inject_gp(vcpu, 0); 3259 return 0; 3260 } 3261 3262 return 1; 3263 } 3264 3265 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3266 { 3267 u8 rvi = vmx_get_rvi(); 3268 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3269 3270 return ((rvi & 0xf0) > (vppr & 0xf0)); 3271 } 3272 3273 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3274 struct vmcs12 *vmcs12); 3275 3276 /* 3277 * If from_vmentry is false, this is being called from state restore (either RSM 3278 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
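 *
 * When from_vmentry is false a failed consistency check still returns
 * NVMX_VMENTRY_VMEXIT, but the vmcs12 host state is not loaded here
 * (see the vmentry_fail_vmexit label below).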
3279 * 3280 * Returns: 3281 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3282 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3283 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3284 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3285 */ 3286 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3287 bool from_vmentry) 3288 { 3289 struct vcpu_vmx *vmx = to_vmx(vcpu); 3290 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3291 enum vm_entry_failure_code entry_failure_code; 3292 bool evaluate_pending_interrupts; 3293 union vmx_exit_reason exit_reason = { 3294 .basic = EXIT_REASON_INVALID_STATE, 3295 .failed_vmentry = 1, 3296 }; 3297 u32 failed_index; 3298 3299 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) 3300 kvm_vcpu_flush_tlb_current(vcpu); 3301 3302 evaluate_pending_interrupts = exec_controls_get(vmx) & 3303 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3304 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3305 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3306 3307 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3308 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3309 if (kvm_mpx_supported() && 3310 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 3311 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3312 3313 /* 3314 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3315 * nested early checks are disabled. In the event of a "late" VM-Fail, 3316 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3317 * software model to the pre-VMEntry host state. When EPT is disabled, 3318 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3319 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3320 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3321 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3322 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3323 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3324 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3325 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3326 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3327 * path would need to manually save/restore vmcs01.GUEST_CR3. 
3328 */ 3329 if (!enable_ept && !nested_early_check) 3330 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3331 3332 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3333 3334 prepare_vmcs02_early(vmx, vmcs12); 3335 3336 if (from_vmentry) { 3337 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3338 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3339 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3340 } 3341 3342 if (nested_vmx_check_vmentry_hw(vcpu)) { 3343 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3344 return NVMX_VMENTRY_VMFAIL; 3345 } 3346 3347 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3348 &entry_failure_code)) { 3349 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3350 vmcs12->exit_qualification = entry_failure_code; 3351 goto vmentry_fail_vmexit; 3352 } 3353 } 3354 3355 enter_guest_mode(vcpu); 3356 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3357 vcpu->arch.tsc_offset += vmcs12->tsc_offset; 3358 3359 if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) { 3360 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3361 vmcs12->exit_qualification = entry_failure_code; 3362 goto vmentry_fail_vmexit_guest_mode; 3363 } 3364 3365 if (from_vmentry) { 3366 failed_index = nested_vmx_load_msr(vcpu, 3367 vmcs12->vm_entry_msr_load_addr, 3368 vmcs12->vm_entry_msr_load_count); 3369 if (failed_index) { 3370 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3371 vmcs12->exit_qualification = failed_index; 3372 goto vmentry_fail_vmexit_guest_mode; 3373 } 3374 } else { 3375 /* 3376 * The MMU is not initialized to point at the right entities yet and 3377 * "get pages" would need to read data from the guest (i.e. we will 3378 * need to perform gpa to hpa translation). Request a call 3379 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3380 * have already been set at vmentry time and should not be reset. 3381 */ 3382 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3383 } 3384 3385 /* 3386 * If L1 had a pending IRQ/NMI until it executed 3387 * VMLAUNCH/VMRESUME which wasn't delivered because it was 3388 * disallowed (e.g. interrupts disabled), L0 needs to 3389 * evaluate if this pending event should cause an exit from L2 3390 * to L1 or delivered directly to L2 (e.g. In case L1 don't 3391 * intercept EXTERNAL_INTERRUPT). 3392 * 3393 * Usually this would be handled by the processor noticing an 3394 * IRQ/NMI window request, or checking RVI during evaluation of 3395 * pending virtual interrupts. However, this setting was done 3396 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 3397 * to perform pending event evaluation by requesting a KVM_REQ_EVENT. 3398 */ 3399 if (unlikely(evaluate_pending_interrupts)) 3400 kvm_make_request(KVM_REQ_EVENT, vcpu); 3401 3402 /* 3403 * Do not start the preemption timer hrtimer until after we know 3404 * we are successful, so that only nested_vmx_vmexit needs to cancel 3405 * the timer. 3406 */ 3407 vmx->nested.preemption_timer_expired = false; 3408 if (nested_cpu_has_preemption_timer(vmcs12)) { 3409 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3410 vmx_start_preemption_timer(vcpu, timer_value); 3411 } 3412 3413 /* 3414 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3415 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3416 * returned as far as L1 is concerned. It will only return (and set 3417 * the success flag) when L2 exits (see nested_vmx_vmexit()). 
3418 */ 3419 return NVMX_VMENTRY_SUCCESS; 3420 3421 /* 3422 * A failed consistency check that leads to a VMExit during L1's 3423 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3424 * 26.7 "VM-entry failures during or after loading guest state". 3425 */ 3426 vmentry_fail_vmexit_guest_mode: 3427 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3428 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3429 leave_guest_mode(vcpu); 3430 3431 vmentry_fail_vmexit: 3432 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3433 3434 if (!from_vmentry) 3435 return NVMX_VMENTRY_VMEXIT; 3436 3437 load_vmcs12_host_state(vcpu, vmcs12); 3438 vmcs12->vm_exit_reason = exit_reason.full; 3439 if (enable_shadow_vmcs || vmx->nested.hv_evmcs) 3440 vmx->nested.need_vmcs12_to_shadow_sync = true; 3441 return NVMX_VMENTRY_VMEXIT; 3442 } 3443 3444 /* 3445 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3446 * for running an L2 nested guest. 3447 */ 3448 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3449 { 3450 struct vmcs12 *vmcs12; 3451 enum nvmx_vmentry_status status; 3452 struct vcpu_vmx *vmx = to_vmx(vcpu); 3453 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3454 enum nested_evmptrld_status evmptrld_status; 3455 3456 ++vcpu->stat.nested_run; 3457 3458 if (!nested_vmx_check_permission(vcpu)) 3459 return 1; 3460 3461 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3462 if (evmptrld_status == EVMPTRLD_ERROR) { 3463 kvm_queue_exception(vcpu, UD_VECTOR); 3464 return 1; 3465 } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) { 3466 return nested_vmx_failInvalid(vcpu); 3467 } 3468 3469 if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)) 3470 return nested_vmx_failInvalid(vcpu); 3471 3472 vmcs12 = get_vmcs12(vcpu); 3473 3474 /* 3475 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3476 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3477 * rather than RFLAGS.ZF, and no error number is stored to the 3478 * VM-instruction error field. 3479 */ 3480 if (CC(vmcs12->hdr.shadow_vmcs)) 3481 return nested_vmx_failInvalid(vcpu); 3482 3483 if (vmx->nested.hv_evmcs) { 3484 copy_enlightened_to_vmcs12(vmx); 3485 /* Enlightened VMCS doesn't have launch state */ 3486 vmcs12->launch_state = !launch; 3487 } else if (enable_shadow_vmcs) { 3488 copy_shadow_to_vmcs12(vmx); 3489 } 3490 3491 /* 3492 * The nested entry process starts with enforcing various prerequisites 3493 * on vmcs12 as required by the Intel SDM, and act appropriately when 3494 * they fail: As the SDM explains, some conditions should cause the 3495 * instruction to fail, while others will cause the instruction to seem 3496 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3497 * To speed up the normal (success) code path, we should avoid checking 3498 * for misconfigurations which will anyway be caught by the processor 3499 * when using the merged vmcs02. 3500 */ 3501 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3502 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3503 3504 if (CC(vmcs12->launch_state == launch)) 3505 return nested_vmx_fail(vcpu, 3506 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3507 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3508 3509 if (nested_vmx_check_controls(vcpu, vmcs12)) 3510 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3511 3512 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3513 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3514 3515 /* 3516 * We're finally done with prerequisite checking, and can start with 3517 * the nested entry. 3518 */ 3519 vmx->nested.nested_run_pending = 1; 3520 vmx->nested.has_preemption_timer_deadline = false; 3521 status = nested_vmx_enter_non_root_mode(vcpu, true); 3522 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3523 goto vmentry_failed; 3524 3525 /* Emulate processing of posted interrupts on VM-Enter. */ 3526 if (nested_cpu_has_posted_intr(vmcs12) && 3527 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3528 vmx->nested.pi_pending = true; 3529 kvm_make_request(KVM_REQ_EVENT, vcpu); 3530 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3531 } 3532 3533 /* Hide L1D cache contents from the nested guest. */ 3534 vmx->vcpu.arch.l1tf_flush_l1d = true; 3535 3536 /* 3537 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3538 * also be used as part of restoring nVMX state for 3539 * snapshot restore (migration). 3540 * 3541 * In this flow, it is assumed that vmcs12 cache was 3542 * transferred as part of captured nVMX state and should 3543 * therefore not be read from guest memory (which may not 3544 * exist on destination host yet). 3545 */ 3546 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3547 3548 switch (vmcs12->guest_activity_state) { 3549 case GUEST_ACTIVITY_HLT: 3550 /* 3551 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3552 * awakened by event injection or by an NMI-window VM-exit or 3553 * by an interrupt-window VM-exit, halt the vcpu. 3554 */ 3555 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3556 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3557 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3558 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3559 vmx->nested.nested_run_pending = 0; 3560 return kvm_vcpu_halt(vcpu); 3561 } 3562 break; 3563 case GUEST_ACTIVITY_WAIT_SIPI: 3564 vmx->nested.nested_run_pending = 0; 3565 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3566 break; 3567 default: 3568 break; 3569 } 3570 3571 return 1; 3572 3573 vmentry_failed: 3574 vmx->nested.nested_run_pending = 0; 3575 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3576 return 0; 3577 if (status == NVMX_VMENTRY_VMEXIT) 3578 return 1; 3579 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3580 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3581 } 3582 3583 /* 3584 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3585 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3586 * This function returns the new value we should put in vmcs12.guest_cr0. 3587 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3588 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3589 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3590 * didn't trap the bit, because if L1 did, so would L0). 3591 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3592 * been modified by L2, and L1 knows it. So just leave the old value of 3593 * the bit from vmcs12.guest_cr0. 
Note that the bit from vmcs02 GUEST_CR0 3594 * isn't relevant, because if L0 traps this bit it can set it to anything. 3595 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3596 * changed these bits, and therefore they need to be updated, but L0 3597 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3598 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3599 */ 3600 static inline unsigned long 3601 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3602 { 3603 return 3604 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3605 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3606 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3607 vcpu->arch.cr0_guest_owned_bits)); 3608 } 3609 3610 static inline unsigned long 3611 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3612 { 3613 return 3614 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3615 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3616 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3617 vcpu->arch.cr4_guest_owned_bits)); 3618 } 3619 3620 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3621 struct vmcs12 *vmcs12) 3622 { 3623 u32 idt_vectoring; 3624 unsigned int nr; 3625 3626 if (vcpu->arch.exception.injected) { 3627 nr = vcpu->arch.exception.nr; 3628 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3629 3630 if (kvm_exception_is_soft(nr)) { 3631 vmcs12->vm_exit_instruction_len = 3632 vcpu->arch.event_exit_inst_len; 3633 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3634 } else 3635 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3636 3637 if (vcpu->arch.exception.has_error_code) { 3638 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3639 vmcs12->idt_vectoring_error_code = 3640 vcpu->arch.exception.error_code; 3641 } 3642 3643 vmcs12->idt_vectoring_info_field = idt_vectoring; 3644 } else if (vcpu->arch.nmi_injected) { 3645 vmcs12->idt_vectoring_info_field = 3646 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3647 } else if (vcpu->arch.interrupt.injected) { 3648 nr = vcpu->arch.interrupt.nr; 3649 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3650 3651 if (vcpu->arch.interrupt.soft) { 3652 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3653 vmcs12->vm_entry_instruction_len = 3654 vcpu->arch.event_exit_inst_len; 3655 } else 3656 idt_vectoring |= INTR_TYPE_EXT_INTR; 3657 3658 vmcs12->idt_vectoring_info_field = idt_vectoring; 3659 } 3660 } 3661 3662 3663 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3664 { 3665 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3666 gfn_t gfn; 3667 3668 /* 3669 * Don't need to mark the APIC access page dirty; it is never 3670 * written to by the CPU during APIC virtualization. 
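 *
 * The virtual-APIC page and the posted-interrupt descriptor, on the
 * other hand, are written by the CPU, so mark them dirty below when the
 * corresponding controls are enabled.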
3671 */ 3672 3673 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3674 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3675 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3676 } 3677 3678 if (nested_cpu_has_posted_intr(vmcs12)) { 3679 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3680 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3681 } 3682 } 3683 3684 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3685 { 3686 struct vcpu_vmx *vmx = to_vmx(vcpu); 3687 int max_irr; 3688 void *vapic_page; 3689 u16 status; 3690 3691 if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) 3692 return; 3693 3694 vmx->nested.pi_pending = false; 3695 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3696 return; 3697 3698 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3699 if (max_irr != 256) { 3700 vapic_page = vmx->nested.virtual_apic_map.hva; 3701 if (!vapic_page) 3702 return; 3703 3704 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3705 vapic_page, &max_irr); 3706 status = vmcs_read16(GUEST_INTR_STATUS); 3707 if ((u8)max_irr > ((u8)status & 0xff)) { 3708 status &= ~0xff; 3709 status |= (u8)max_irr; 3710 vmcs_write16(GUEST_INTR_STATUS, status); 3711 } 3712 } 3713 3714 nested_mark_vmcs12_pages_dirty(vcpu); 3715 } 3716 3717 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3718 unsigned long exit_qual) 3719 { 3720 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3721 unsigned int nr = vcpu->arch.exception.nr; 3722 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3723 3724 if (vcpu->arch.exception.has_error_code) { 3725 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3726 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3727 } 3728 3729 if (kvm_exception_is_soft(nr)) 3730 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3731 else 3732 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3733 3734 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3735 vmx_get_nmi_mask(vcpu)) 3736 intr_info |= INTR_INFO_UNBLOCK_NMI; 3737 3738 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3739 } 3740 3741 /* 3742 * Returns true if a debug trap is pending delivery. 3743 * 3744 * In KVM, debug traps bear an exception payload. As such, the class of a #DB 3745 * exception may be inferred from the presence of an exception payload. 3746 */ 3747 static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) 3748 { 3749 return vcpu->arch.exception.pending && 3750 vcpu->arch.exception.nr == DB_VECTOR && 3751 vcpu->arch.exception.payload; 3752 } 3753 3754 /* 3755 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 3756 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 3757 * represents these debug traps with a payload that is said to be compatible 3758 * with the 'pending debug exceptions' field, write the payload to the VMCS 3759 * field if a VM-exit is delivered before the debug trap. 
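 *
 * For example, a single-step trap is represented by a payload with the BS
 * bit (bit 14) set, matching the DR6/pending-debug-exceptions layout, so
 * the payload can be copied into the field as-is.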
3760 */ 3761 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 3762 { 3763 if (vmx_pending_dbg_trap(vcpu)) 3764 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 3765 vcpu->arch.exception.payload); 3766 } 3767 3768 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 3769 { 3770 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3771 to_vmx(vcpu)->nested.preemption_timer_expired; 3772 } 3773 3774 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 3775 { 3776 struct vcpu_vmx *vmx = to_vmx(vcpu); 3777 unsigned long exit_qual; 3778 bool block_nested_events = 3779 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3780 bool mtf_pending = vmx->nested.mtf_pending; 3781 struct kvm_lapic *apic = vcpu->arch.apic; 3782 3783 /* 3784 * Clear the MTF state. If a higher priority VM-exit is delivered first, 3785 * this state is discarded. 3786 */ 3787 if (!block_nested_events) 3788 vmx->nested.mtf_pending = false; 3789 3790 if (lapic_in_kernel(vcpu) && 3791 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3792 if (block_nested_events) 3793 return -EBUSY; 3794 nested_vmx_update_pending_dbg(vcpu); 3795 clear_bit(KVM_APIC_INIT, &apic->pending_events); 3796 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 3797 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3798 return 0; 3799 } 3800 3801 if (lapic_in_kernel(vcpu) && 3802 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 3803 if (block_nested_events) 3804 return -EBUSY; 3805 3806 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 3807 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 3808 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 3809 apic->sipi_vector & 0xFFUL); 3810 return 0; 3811 } 3812 3813 /* 3814 * Process any exceptions that are not debug traps before MTF. 3815 * 3816 * Note that only a pending nested run can block a pending exception. 3817 * Otherwise an injected NMI/interrupt should either be 3818 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO, 3819 * while delivering the pending exception. 
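 *
 * The checks below therefore run in the following order: INIT and SIPI,
 * exceptions other than debug traps, MTF, debug-trap exceptions, the
 * VMX-preemption timer, SMI, NMI, and finally external interrupts.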
3820 */ 3821 3822 if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { 3823 if (vmx->nested.nested_run_pending) 3824 return -EBUSY; 3825 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3826 goto no_vmexit; 3827 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3828 return 0; 3829 } 3830 3831 if (mtf_pending) { 3832 if (block_nested_events) 3833 return -EBUSY; 3834 nested_vmx_update_pending_dbg(vcpu); 3835 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 3836 return 0; 3837 } 3838 3839 if (vcpu->arch.exception.pending) { 3840 if (vmx->nested.nested_run_pending) 3841 return -EBUSY; 3842 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3843 goto no_vmexit; 3844 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3845 return 0; 3846 } 3847 3848 if (nested_vmx_preemption_timer_pending(vcpu)) { 3849 if (block_nested_events) 3850 return -EBUSY; 3851 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3852 return 0; 3853 } 3854 3855 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 3856 if (block_nested_events) 3857 return -EBUSY; 3858 goto no_vmexit; 3859 } 3860 3861 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 3862 if (block_nested_events) 3863 return -EBUSY; 3864 if (!nested_exit_on_nmi(vcpu)) 3865 goto no_vmexit; 3866 3867 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3868 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3869 INTR_INFO_VALID_MASK, 0); 3870 /* 3871 * The NMI-triggered VM exit counts as injection: 3872 * clear this one and block further NMIs. 3873 */ 3874 vcpu->arch.nmi_pending = 0; 3875 vmx_set_nmi_mask(vcpu, true); 3876 return 0; 3877 } 3878 3879 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 3880 if (block_nested_events) 3881 return -EBUSY; 3882 if (!nested_exit_on_intr(vcpu)) 3883 goto no_vmexit; 3884 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3885 return 0; 3886 } 3887 3888 no_vmexit: 3889 vmx_complete_nested_posted_interrupt(vcpu); 3890 return 0; 3891 } 3892 3893 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3894 { 3895 ktime_t remaining = 3896 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 3897 u64 value; 3898 3899 if (ktime_to_ns(remaining) <= 0) 3900 return 0; 3901 3902 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 3903 do_div(value, 1000000); 3904 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 3905 } 3906 3907 static bool is_vmcs12_ext_field(unsigned long field) 3908 { 3909 switch (field) { 3910 case GUEST_ES_SELECTOR: 3911 case GUEST_CS_SELECTOR: 3912 case GUEST_SS_SELECTOR: 3913 case GUEST_DS_SELECTOR: 3914 case GUEST_FS_SELECTOR: 3915 case GUEST_GS_SELECTOR: 3916 case GUEST_LDTR_SELECTOR: 3917 case GUEST_TR_SELECTOR: 3918 case GUEST_ES_LIMIT: 3919 case GUEST_CS_LIMIT: 3920 case GUEST_SS_LIMIT: 3921 case GUEST_DS_LIMIT: 3922 case GUEST_FS_LIMIT: 3923 case GUEST_GS_LIMIT: 3924 case GUEST_LDTR_LIMIT: 3925 case GUEST_TR_LIMIT: 3926 case GUEST_GDTR_LIMIT: 3927 case GUEST_IDTR_LIMIT: 3928 case GUEST_ES_AR_BYTES: 3929 case GUEST_DS_AR_BYTES: 3930 case GUEST_FS_AR_BYTES: 3931 case GUEST_GS_AR_BYTES: 3932 case GUEST_LDTR_AR_BYTES: 3933 case GUEST_TR_AR_BYTES: 3934 case GUEST_ES_BASE: 3935 case GUEST_CS_BASE: 3936 case GUEST_SS_BASE: 3937 case GUEST_DS_BASE: 3938 case GUEST_FS_BASE: 3939 case GUEST_GS_BASE: 3940 case GUEST_LDTR_BASE: 3941 case GUEST_TR_BASE: 3942 case GUEST_GDTR_BASE: 3943 case GUEST_IDTR_BASE: 3944 case GUEST_PENDING_DBG_EXCEPTIONS: 3945 case GUEST_BNDCFGS: 3946 return true; 3947 default: 3948 break; 3949 } 3950 3951 
return false; 3952 } 3953 3954 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3955 struct vmcs12 *vmcs12) 3956 { 3957 struct vcpu_vmx *vmx = to_vmx(vcpu); 3958 3959 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 3960 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 3961 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 3962 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 3963 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 3964 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 3965 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 3966 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 3967 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 3968 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 3969 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 3970 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 3971 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 3972 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 3973 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 3974 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 3975 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 3976 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 3977 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 3978 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 3979 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 3980 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 3981 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 3982 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 3983 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 3984 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 3985 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 3986 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 3987 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 3988 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 3989 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 3990 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 3991 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 3992 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 3993 vmcs12->guest_pending_dbg_exceptions = 3994 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 3995 if (kvm_mpx_supported()) 3996 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3997 3998 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 3999 } 4000 4001 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4002 struct vmcs12 *vmcs12) 4003 { 4004 struct vcpu_vmx *vmx = to_vmx(vcpu); 4005 int cpu; 4006 4007 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4008 return; 4009 4010 4011 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4012 4013 cpu = get_cpu(); 4014 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4015 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4016 4017 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4018 4019 vmx->loaded_vmcs = &vmx->vmcs01; 4020 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4021 put_cpu(); 4022 } 4023 4024 /* 4025 * Update the guest state fields of vmcs12 to reflect changes that 4026 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4027 * VM-entry controls is also updated, since this is really a guest 4028 * state bit.) 
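 *
 * Fields that L1 rarely reads are gathered by sync_vmcs02_to_vmcs12_rare();
 * unless an enlightened VMCS is in use, that work is deferred via
 * need_sync_vmcs02_to_vmcs12_rare and only performed when L1 actually
 * VMREADs one of those fields (see copy_vmcs02_to_vmcs12_rare()).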
4029 */ 4030 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4031 { 4032 struct vcpu_vmx *vmx = to_vmx(vcpu); 4033 4034 if (vmx->nested.hv_evmcs) 4035 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4036 4037 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs; 4038 4039 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4040 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4041 4042 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4043 vmcs12->guest_rip = kvm_rip_read(vcpu); 4044 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4045 4046 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4047 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4048 4049 vmcs12->guest_interruptibility_info = 4050 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4051 4052 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4053 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4054 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4055 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4056 else 4057 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4058 4059 if (nested_cpu_has_preemption_timer(vmcs12) && 4060 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4061 !vmx->nested.nested_run_pending) 4062 vmcs12->vmx_preemption_timer_value = 4063 vmx_get_preemption_timer_value(vcpu); 4064 4065 /* 4066 * In some cases (usually, nested EPT), L2 is allowed to change its 4067 * own CR3 without exiting. If it has changed it, we must keep it. 4068 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4069 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4070 * 4071 * Additionally, restore L2's PDPTR to vmcs12. 4072 */ 4073 if (enable_ept) { 4074 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4075 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4076 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4077 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4078 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4079 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4080 } 4081 } 4082 4083 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4084 4085 if (nested_cpu_has_vid(vmcs12)) 4086 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4087 4088 vmcs12->vm_entry_controls = 4089 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4090 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4091 4092 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4093 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 4094 4095 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4096 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4097 } 4098 4099 /* 4100 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4101 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4102 * and this function updates it to reflect the changes to the guest state while 4103 * L2 was running (and perhaps made some exits which were handled directly by L0 4104 * without going back to L1), and to reflect the exit reason. 4105 * Note that we do not have to copy here all VMCS fields, just those that 4106 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4107 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4108 * which already writes to vmcs12 directly. 
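 *
 * Also note that launch_state is only set, the pending event is only
 * transferred to IDT_VECTORING_INFO and the exit MSR-store list is only
 * processed when the exit is not a failed VM-entry (see the
 * VMX_EXIT_REASONS_FAILED_VMENTRY check below).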
4109 */ 4110 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4111 u32 vm_exit_reason, u32 exit_intr_info, 4112 unsigned long exit_qualification) 4113 { 4114 /* update exit information fields: */ 4115 vmcs12->vm_exit_reason = vm_exit_reason; 4116 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4117 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4118 vmcs12->exit_qualification = exit_qualification; 4119 vmcs12->vm_exit_intr_info = exit_intr_info; 4120 4121 vmcs12->idt_vectoring_info_field = 0; 4122 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4123 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4124 4125 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4126 vmcs12->launch_state = 1; 4127 4128 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4129 * instead of reading the real value. */ 4130 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4131 4132 /* 4133 * Transfer the event that L0 or L1 may wanted to inject into 4134 * L2 to IDT_VECTORING_INFO_FIELD. 4135 */ 4136 vmcs12_save_pending_event(vcpu, vmcs12); 4137 4138 /* 4139 * According to spec, there's no need to store the guest's 4140 * MSRs if the exit is due to a VM-entry failure that occurs 4141 * during or after loading the guest state. Since this exit 4142 * does not fall in that category, we need to save the MSRs. 4143 */ 4144 if (nested_vmx_store_msr(vcpu, 4145 vmcs12->vm_exit_msr_store_addr, 4146 vmcs12->vm_exit_msr_store_count)) 4147 nested_vmx_abort(vcpu, 4148 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4149 } 4150 4151 /* 4152 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 4153 * preserved above and would only end up incorrectly in L1. 4154 */ 4155 vcpu->arch.nmi_injected = false; 4156 kvm_clear_exception_queue(vcpu); 4157 kvm_clear_interrupt_queue(vcpu); 4158 } 4159 4160 /* 4161 * A part of what we need to when the nested L2 guest exits and we want to 4162 * run its L1 parent, is to reset L1's guest state to the host state specified 4163 * in vmcs12. 4164 * This function is to be called not only on normal nested exit, but also on 4165 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4166 * Failures During or After Loading Guest State"). 4167 * This function should be called when the active VMCS is L1's (vmcs01). 4168 */ 4169 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4170 struct vmcs12 *vmcs12) 4171 { 4172 enum vm_entry_failure_code ignored; 4173 struct kvm_segment seg; 4174 4175 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4176 vcpu->arch.efer = vmcs12->host_ia32_efer; 4177 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4178 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4179 else 4180 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4181 vmx_set_efer(vcpu, vcpu->arch.efer); 4182 4183 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4184 kvm_rip_write(vcpu, vmcs12->host_rip); 4185 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4186 vmx_set_interrupt_shadow(vcpu, 0); 4187 4188 /* 4189 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4190 * actually changed, because vmx_set_cr0 refers to efer set above. 4191 * 4192 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4193 * (KVM doesn't change it); 4194 */ 4195 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4196 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4197 4198 /* Same as above - no reason to call set_cr4_guest_host_mask(). 
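 * CR4_GUEST_HOST_MASK is likewise still the vmcs01 value, so only the
 * cached cr4_guest_owned_bits need to be recomputed from it.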
*/ 4199 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4200 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4201 4202 nested_ept_uninit_mmu_context(vcpu); 4203 4204 /* 4205 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4206 * couldn't have changed. 4207 */ 4208 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored)) 4209 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4210 4211 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4212 4213 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4214 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4215 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4216 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4217 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4218 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4219 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4220 4221 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4222 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4223 vmcs_write64(GUEST_BNDCFGS, 0); 4224 4225 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4226 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4227 vcpu->arch.pat = vmcs12->host_ia32_pat; 4228 } 4229 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 4230 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4231 vmcs12->host_ia32_perf_global_ctrl)); 4232 4233 /* Set L1 segment info according to Intel SDM 4234 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4235 seg = (struct kvm_segment) { 4236 .base = 0, 4237 .limit = 0xFFFFFFFF, 4238 .selector = vmcs12->host_cs_selector, 4239 .type = 11, 4240 .present = 1, 4241 .s = 1, 4242 .g = 1 4243 }; 4244 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4245 seg.l = 1; 4246 else 4247 seg.db = 1; 4248 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4249 seg = (struct kvm_segment) { 4250 .base = 0, 4251 .limit = 0xFFFFFFFF, 4252 .type = 3, 4253 .present = 1, 4254 .s = 1, 4255 .db = 1, 4256 .g = 1 4257 }; 4258 seg.selector = vmcs12->host_ds_selector; 4259 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4260 seg.selector = vmcs12->host_es_selector; 4261 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4262 seg.selector = vmcs12->host_ss_selector; 4263 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4264 seg.selector = vmcs12->host_fs_selector; 4265 seg.base = vmcs12->host_fs_base; 4266 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4267 seg.selector = vmcs12->host_gs_selector; 4268 seg.base = vmcs12->host_gs_base; 4269 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4270 seg = (struct kvm_segment) { 4271 .base = vmcs12->host_tr_base, 4272 .limit = 0x67, 4273 .selector = vmcs12->host_tr_selector, 4274 .type = 11, 4275 .present = 1 4276 }; 4277 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4278 4279 kvm_set_dr(vcpu, 7, 0x400); 4280 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4281 4282 if (cpu_has_vmx_msr_bitmap()) 4283 vmx_update_msr_bitmap(vcpu); 4284 4285 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4286 vmcs12->vm_exit_msr_load_count)) 4287 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4288 } 4289 4290 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4291 { 4292 struct vmx_uret_msr *efer_msr; 4293 unsigned int i; 4294 4295 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4296 return vmcs_read64(GUEST_IA32_EFER); 4297 4298 if (cpu_has_load_ia32_efer()) 4299 return host_efer; 4300 4301 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4302 if 
(vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4303 return vmx->msr_autoload.guest.val[i].value; 4304 } 4305 4306 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4307 if (efer_msr) 4308 return efer_msr->data; 4309 4310 return host_efer; 4311 } 4312 4313 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4314 { 4315 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4316 struct vcpu_vmx *vmx = to_vmx(vcpu); 4317 struct vmx_msr_entry g, h; 4318 gpa_t gpa; 4319 u32 i, j; 4320 4321 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4322 4323 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4324 /* 4325 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4326 * as vmcs01.GUEST_DR7 contains a userspace defined value 4327 * and vcpu->arch.dr7 is not squirreled away before the 4328 * nested VMENTER (not worth adding a variable in nested_vmx). 4329 */ 4330 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4331 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4332 else 4333 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4334 } 4335 4336 /* 4337 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4338 * handle a variety of side effects to KVM's software model. 4339 */ 4340 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4341 4342 vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4343 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4344 4345 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4346 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4347 4348 nested_ept_uninit_mmu_context(vcpu); 4349 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4350 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4351 4352 /* 4353 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4354 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4355 * VMFail, like everything else we just need to ensure our 4356 * software model is up-to-date. 4357 */ 4358 if (enable_ept && is_pae_paging(vcpu)) 4359 ept_save_pdptrs(vcpu); 4360 4361 kvm_mmu_reset_context(vcpu); 4362 4363 if (cpu_has_vmx_msr_bitmap()) 4364 vmx_update_msr_bitmap(vcpu); 4365 4366 /* 4367 * This nasty bit of open coding is a compromise between blindly 4368 * loading L1's MSRs using the exit load lists (incorrect emulation 4369 * of VMFail), leaving the nested VM's MSRs in the software model 4370 * (incorrect behavior) and snapshotting the modified MSRs (too 4371 * expensive since the lists are unbound by hardware). For each 4372 * MSR that was (prematurely) loaded from the nested VMEntry load 4373 * list, reload it from the exit load list if it exists and differs 4374 * from the guest value. The intent is to stuff host state as 4375 * silently as possible, not to fully process the exit load list. 
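 *
 * Roughly, the loop below does the following (illustrative outline only):
 *
 *	for each entry g in the VM-entry MSR-load list:
 *		find h in the VM-exit MSR-load list with h.index == g.index
 *		if found and h.value != g.value:
 *			WRMSR(h.index, h.value)   <- reload L1's value
 *
 * with any failure to read, validate or write an entry escalating to a
 * VMX abort.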
4376 */ 4377 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4378 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4379 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4380 pr_debug_ratelimited( 4381 "%s read MSR index failed (%u, 0x%08llx)\n", 4382 __func__, i, gpa); 4383 goto vmabort; 4384 } 4385 4386 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4387 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4388 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4389 pr_debug_ratelimited( 4390 "%s read MSR failed (%u, 0x%08llx)\n", 4391 __func__, j, gpa); 4392 goto vmabort; 4393 } 4394 if (h.index != g.index) 4395 continue; 4396 if (h.value == g.value) 4397 break; 4398 4399 if (nested_vmx_load_msr_check(vcpu, &h)) { 4400 pr_debug_ratelimited( 4401 "%s check failed (%u, 0x%x, 0x%x)\n", 4402 __func__, j, h.index, h.reserved); 4403 goto vmabort; 4404 } 4405 4406 if (kvm_set_msr(vcpu, h.index, h.value)) { 4407 pr_debug_ratelimited( 4408 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4409 __func__, j, h.index, h.value); 4410 goto vmabort; 4411 } 4412 } 4413 } 4414 4415 return; 4416 4417 vmabort: 4418 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4419 } 4420 4421 /* 4422 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4423 * and modify vmcs12 to make it see what it would expect to see there if 4424 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4425 */ 4426 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4427 u32 exit_intr_info, unsigned long exit_qualification) 4428 { 4429 struct vcpu_vmx *vmx = to_vmx(vcpu); 4430 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4431 4432 /* trying to cancel vmlaunch/vmresume is a bug */ 4433 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4434 4435 /* Similarly, triple faults in L2 should never escape. */ 4436 WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)); 4437 4438 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 4439 4440 /* Service the TLB flush request for L2 before switching to L1. */ 4441 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) 4442 kvm_vcpu_flush_tlb_current(vcpu); 4443 4444 /* 4445 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4446 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4447 * up-to-date before switching to L1. 4448 */ 4449 if (enable_ept && is_pae_paging(vcpu)) 4450 vmx_ept_load_pdptrs(vcpu); 4451 4452 leave_guest_mode(vcpu); 4453 4454 if (nested_cpu_has_preemption_timer(vmcs12)) 4455 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4456 4457 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 4458 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 4459 4460 if (likely(!vmx->fail)) { 4461 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4462 4463 if (vm_exit_reason != -1) 4464 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4465 exit_intr_info, exit_qualification); 4466 4467 /* 4468 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4469 * also be used to capture vmcs12 cache as part of 4470 * capturing nVMX state for snapshot (migration). 4471 * 4472 * Otherwise, this flush will dirty guest memory at a 4473 * point it is already assumed by user-space to be 4474 * immutable. 4475 */ 4476 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4477 } else { 4478 /* 4479 * The only expected VM-instruction error is "VM entry with 4480 * invalid control field(s)." Anything else indicates a 4481 * problem with L0. 
And we should never get here with a 4482 * VMFail of any type if early consistency checks are enabled. 4483 */ 4484 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4485 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4486 WARN_ON_ONCE(nested_early_check); 4487 } 4488 4489 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4490 4491 /* Update any VMCS fields that might have changed while L2 ran */ 4492 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4493 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4494 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4495 if (vmx->nested.l1_tpr_threshold != -1) 4496 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4497 4498 if (kvm_has_tsc_control) 4499 decache_tsc_multiplier(vmx); 4500 4501 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4502 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4503 vmx_set_virtual_apic_mode(vcpu); 4504 } 4505 4506 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4507 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4508 vmx_update_cpu_dirty_logging(vcpu); 4509 } 4510 4511 /* Unpin physical memory we referred to in vmcs02 */ 4512 if (vmx->nested.apic_access_page) { 4513 kvm_release_page_clean(vmx->nested.apic_access_page); 4514 vmx->nested.apic_access_page = NULL; 4515 } 4516 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4517 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4518 vmx->nested.pi_desc = NULL; 4519 4520 if (vmx->nested.reload_vmcs01_apic_access_page) { 4521 vmx->nested.reload_vmcs01_apic_access_page = false; 4522 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4523 } 4524 4525 if ((vm_exit_reason != -1) && 4526 (enable_shadow_vmcs || vmx->nested.hv_evmcs)) 4527 vmx->nested.need_vmcs12_to_shadow_sync = true; 4528 4529 /* in case we halted in L2 */ 4530 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4531 4532 if (likely(!vmx->fail)) { 4533 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4534 nested_exit_intr_ack_set(vcpu)) { 4535 int irq = kvm_cpu_get_interrupt(vcpu); 4536 WARN_ON(irq < 0); 4537 vmcs12->vm_exit_intr_info = irq | 4538 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4539 } 4540 4541 if (vm_exit_reason != -1) 4542 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4543 vmcs12->exit_qualification, 4544 vmcs12->idt_vectoring_info_field, 4545 vmcs12->vm_exit_intr_info, 4546 vmcs12->vm_exit_intr_error_code, 4547 KVM_ISA_VMX); 4548 4549 load_vmcs12_host_state(vcpu, vmcs12); 4550 4551 return; 4552 } 4553 4554 /* 4555 * After an early L2 VM-entry failure, we're now back 4556 * in L1 which thinks it just finished a VMLAUNCH or 4557 * VMRESUME instruction, so we need to set the failure 4558 * flag and the VM-instruction error field of the VMCS 4559 * accordingly, and skip the emulated instruction. 4560 */ 4561 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4562 4563 /* 4564 * Restore L1's host state to KVM's software model. We're here 4565 * because a consistency check was caught by hardware, which 4566 * means some amount of guest state has been propagated to KVM's 4567 * model and needs to be unwound to the host's state. 4568 */ 4569 nested_vmx_restore_host_state(vcpu); 4570 4571 vmx->fail = 0; 4572 } 4573 4574 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4575 { 4576 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4577 } 4578 4579 /* 4580 * Decode the memory-address operand of a vmx instruction, as recorded on an 4581 * exit caused by such an instruction (run by a guest hypervisor). 
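 * As a purely illustrative example, for "vmread [rbx + rcx*4 + disp32], rax"
 * the VM-exit instruction-information field decodes roughly as
 *
 *	scaling   = info & 3;		-> 2, i.e. scale the index by 4
 *	base_reg  = (info >> 23) & 0xf;	-> RBX
 *	index_reg = (info >> 18) & 0xf;	-> RCX
 *	reg2      = (info >> 28) & 0xf;	-> RAX (the VMCS field encoding,
 *					   consumed by the VMREAD handler)
 *
 * while exit_qualification holds disp32.
 *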
4582 * On success, returns 0. When the operand is invalid, returns 1 and throws 4583 * #UD, #GP, or #SS. 4584 */ 4585 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4586 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4587 { 4588 gva_t off; 4589 bool exn; 4590 struct kvm_segment s; 4591 4592 /* 4593 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4594 * Execution", on an exit, vmx_instruction_info holds most of the 4595 * addressing components of the operand. Only the displacement part 4596 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4597 * For how an actual address is calculated from all these components, 4598 * refer to Vol. 1, "Operand Addressing". 4599 */ 4600 int scaling = vmx_instruction_info & 3; 4601 int addr_size = (vmx_instruction_info >> 7) & 7; 4602 bool is_reg = vmx_instruction_info & (1u << 10); 4603 int seg_reg = (vmx_instruction_info >> 15) & 7; 4604 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4605 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4606 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4607 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4608 4609 if (is_reg) { 4610 kvm_queue_exception(vcpu, UD_VECTOR); 4611 return 1; 4612 } 4613 4614 /* Addr = segment_base + offset */ 4615 /* offset = base + [index * scale] + displacement */ 4616 off = exit_qualification; /* holds the displacement */ 4617 if (addr_size == 1) 4618 off = (gva_t)sign_extend64(off, 31); 4619 else if (addr_size == 0) 4620 off = (gva_t)sign_extend64(off, 15); 4621 if (base_is_valid) 4622 off += kvm_register_read(vcpu, base_reg); 4623 if (index_is_valid) 4624 off += kvm_register_read(vcpu, index_reg) << scaling; 4625 vmx_get_segment(vcpu, &s, seg_reg); 4626 4627 /* 4628 * The effective address, i.e. @off, of a memory operand is truncated 4629 * based on the address size of the instruction. Note that this is 4630 * the *effective address*, i.e. the address prior to accounting for 4631 * the segment's base. 4632 */ 4633 if (addr_size == 1) /* 32 bit */ 4634 off &= 0xffffffff; 4635 else if (addr_size == 0) /* 16 bit */ 4636 off &= 0xffff; 4637 4638 /* Checks for #GP/#SS exceptions. */ 4639 exn = false; 4640 if (is_long_mode(vcpu)) { 4641 /* 4642 * The virtual/linear address is never truncated in 64-bit 4643 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4644 * address when using FS/GS with a non-zero base. 4645 */ 4646 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4647 *ret = s.base + off; 4648 else 4649 *ret = off; 4650 4651 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4652 * non-canonical form. This is the only check on the memory 4653 * destination for long mode! 4654 */ 4655 exn = is_noncanonical_address(*ret, vcpu); 4656 } else { 4657 /* 4658 * When not in long mode, the virtual/linear address is 4659 * unconditionally truncated to 32 bits regardless of the 4660 * address size. 4661 */ 4662 *ret = (s.base + off) & 0xffffffff; 4663 4664 /* Protected mode: apply checks for segment validity in the 4665 * following order: 4666 * - segment type check (#GP(0) may be thrown) 4667 * - usability check (#GP(0)/#SS(0)) 4668 * - limit check (#GP(0)/#SS(0)) 4669 */ 4670 if (wr) 4671 /* #GP(0) if the destination operand is located in a 4672 * read-only data segment or any code segment. 
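 * In the segment type checks here and below, bit 3 of s.type distinguishes
 * code (set) from data (clear) segments, and bit 1 is the write-enable bit
 * for data segments resp. the read-enable bit for code segments. Hence
 * "(s.type & 0xa) == 0" is a read-only data segment, "s.type & 8" is any
 * code segment, and "(s.type & 0xa) == 8" is an execute-only code segment.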
4673 */ 4674 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4675 else 4676 /* #GP(0) if the source operand is located in an 4677 * execute-only code segment 4678 */ 4679 exn = ((s.type & 0xa) == 8); 4680 if (exn) { 4681 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4682 return 1; 4683 } 4684 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4685 */ 4686 exn = (s.unusable != 0); 4687 4688 /* 4689 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4690 * outside the segment limit. All CPUs that support VMX ignore 4691 * limit checks for flat segments, i.e. segments with base==0, 4692 * limit==0xffffffff and of type expand-up data or code. 4693 */ 4694 if (!(s.base == 0 && s.limit == 0xffffffff && 4695 ((s.type & 8) || !(s.type & 4)))) 4696 exn = exn || ((u64)off + len - 1 > s.limit); 4697 } 4698 if (exn) { 4699 kvm_queue_exception_e(vcpu, 4700 seg_reg == VCPU_SREG_SS ? 4701 SS_VECTOR : GP_VECTOR, 4702 0); 4703 return 1; 4704 } 4705 4706 return 0; 4707 } 4708 4709 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) 4710 { 4711 struct vcpu_vmx *vmx; 4712 4713 if (!nested_vmx_allowed(vcpu)) 4714 return; 4715 4716 vmx = to_vmx(vcpu); 4717 if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { 4718 vmx->nested.msrs.entry_ctls_high |= 4719 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4720 vmx->nested.msrs.exit_ctls_high |= 4721 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4722 } else { 4723 vmx->nested.msrs.entry_ctls_high &= 4724 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4725 vmx->nested.msrs.exit_ctls_high &= 4726 ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4727 } 4728 } 4729 4730 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 4731 int *ret) 4732 { 4733 gva_t gva; 4734 struct x86_exception e; 4735 int r; 4736 4737 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 4738 vmcs_read32(VMX_INSTRUCTION_INFO), false, 4739 sizeof(*vmpointer), &gva)) { 4740 *ret = 1; 4741 return -EINVAL; 4742 } 4743 4744 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 4745 if (r != X86EMUL_CONTINUE) { 4746 *ret = kvm_handle_memory_failure(vcpu, r, &e); 4747 return -EINVAL; 4748 } 4749 4750 return 0; 4751 } 4752 4753 /* 4754 * Allocate a shadow VMCS and associate it with the currently loaded 4755 * VMCS, unless such a shadow VMCS already exists. The newly allocated 4756 * VMCS is also VMCLEARed, so that it is ready for use. 4757 */ 4758 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 4759 { 4760 struct vcpu_vmx *vmx = to_vmx(vcpu); 4761 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 4762 4763 /* 4764 * We should allocate a shadow vmcs for vmcs01 only when L1 4765 * executes VMXON and free it when L1 executes VMXOFF. 4766 * As it is invalid to execute VMXON twice, we shouldn't reach 4767 * here when vmcs01 already have an allocated shadow vmcs. 
4768 */ 4769 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); 4770 4771 if (!loaded_vmcs->shadow_vmcs) { 4772 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 4773 if (loaded_vmcs->shadow_vmcs) 4774 vmcs_clear(loaded_vmcs->shadow_vmcs); 4775 } 4776 return loaded_vmcs->shadow_vmcs; 4777 } 4778 4779 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 4780 { 4781 struct vcpu_vmx *vmx = to_vmx(vcpu); 4782 int r; 4783 4784 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 4785 if (r < 0) 4786 goto out_vmcs02; 4787 4788 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4789 if (!vmx->nested.cached_vmcs12) 4790 goto out_cached_vmcs12; 4791 4792 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4793 if (!vmx->nested.cached_shadow_vmcs12) 4794 goto out_cached_shadow_vmcs12; 4795 4796 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 4797 goto out_shadow_vmcs; 4798 4799 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 4800 HRTIMER_MODE_ABS_PINNED); 4801 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 4802 4803 vmx->nested.vpid02 = allocate_vpid(); 4804 4805 vmx->nested.vmcs02_initialized = false; 4806 vmx->nested.vmxon = true; 4807 4808 if (vmx_pt_mode_is_host_guest()) { 4809 vmx->pt_desc.guest.ctl = 0; 4810 pt_update_intercept_for_msr(vcpu); 4811 } 4812 4813 return 0; 4814 4815 out_shadow_vmcs: 4816 kfree(vmx->nested.cached_shadow_vmcs12); 4817 4818 out_cached_shadow_vmcs12: 4819 kfree(vmx->nested.cached_vmcs12); 4820 4821 out_cached_vmcs12: 4822 free_loaded_vmcs(&vmx->nested.vmcs02); 4823 4824 out_vmcs02: 4825 return -ENOMEM; 4826 } 4827 4828 /* 4829 * Emulate the VMXON instruction. 4830 * Currently, we just remember that VMX is active, and do not save or even 4831 * inspect the argument to VMXON (the so-called "VMXON pointer") because we 4832 * do not currently need to store anything in that guest-allocated memory 4833 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their 4834 * argument is different from the VMXON pointer (which the spec says they do). 4835 */ 4836 static int handle_vmon(struct kvm_vcpu *vcpu) 4837 { 4838 int ret; 4839 gpa_t vmptr; 4840 uint32_t revision; 4841 struct vcpu_vmx *vmx = to_vmx(vcpu); 4842 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 4843 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 4844 4845 /* 4846 * The Intel VMX Instruction Reference lists a bunch of bits that are 4847 * prerequisite to running VMXON, most notably cr4.VMXE must be set to 4848 * 1 (see vmx_is_valid_cr4() for when we allow the guest to set this). 4849 * Otherwise, we should fail with #UD. But most faulting conditions 4850 * have already been checked by hardware, prior to the VM-exit for 4851 * VMXON. We do test guest cr4.VMXE because processor CR4 always has 4852 * that bit set to 1 in non-root mode. 4853 */ 4854 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { 4855 kvm_queue_exception(vcpu, UD_VECTOR); 4856 return 1; 4857 } 4858 4859 /* CPL=0 must be checked manually. 
*/ 4860 if (vmx_get_cpl(vcpu)) { 4861 kvm_inject_gp(vcpu, 0); 4862 return 1; 4863 } 4864 4865 if (vmx->nested.vmxon) 4866 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 4867 4868 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 4869 != VMXON_NEEDED_FEATURES) { 4870 kvm_inject_gp(vcpu, 0); 4871 return 1; 4872 } 4873 4874 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 4875 return ret; 4876 4877 /* 4878 * SDM 3: 24.11.5 4879 * The first 4 bytes of VMXON region contain the supported 4880 * VMCS revision identifier 4881 * 4882 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 4883 * which replaces physical address width with 32 4884 */ 4885 if (!page_address_valid(vcpu, vmptr)) 4886 return nested_vmx_failInvalid(vcpu); 4887 4888 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 4889 revision != VMCS12_REVISION) 4890 return nested_vmx_failInvalid(vcpu); 4891 4892 vmx->nested.vmxon_ptr = vmptr; 4893 ret = enter_vmx_operation(vcpu); 4894 if (ret) 4895 return ret; 4896 4897 return nested_vmx_succeed(vcpu); 4898 } 4899 4900 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 4901 { 4902 struct vcpu_vmx *vmx = to_vmx(vcpu); 4903 4904 if (vmx->nested.current_vmptr == -1ull) 4905 return; 4906 4907 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 4908 4909 if (enable_shadow_vmcs) { 4910 /* copy to memory all shadowed fields in case 4911 they were modified */ 4912 copy_shadow_to_vmcs12(vmx); 4913 vmx_disable_shadow_vmcs(vmx); 4914 } 4915 vmx->nested.posted_intr_nv = -1; 4916 4917 /* Flush VMCS12 to guest memory */ 4918 kvm_vcpu_write_guest_page(vcpu, 4919 vmx->nested.current_vmptr >> PAGE_SHIFT, 4920 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 4921 4922 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 4923 4924 vmx->nested.current_vmptr = -1ull; 4925 } 4926 4927 /* Emulate the VMXOFF instruction */ 4928 static int handle_vmoff(struct kvm_vcpu *vcpu) 4929 { 4930 if (!nested_vmx_check_permission(vcpu)) 4931 return 1; 4932 4933 free_nested(vcpu); 4934 4935 /* Process a latched INIT during time CPU was in VMX operation */ 4936 kvm_make_request(KVM_REQ_EVENT, vcpu); 4937 4938 return nested_vmx_succeed(vcpu); 4939 } 4940 4941 /* Emulate the VMCLEAR instruction */ 4942 static int handle_vmclear(struct kvm_vcpu *vcpu) 4943 { 4944 struct vcpu_vmx *vmx = to_vmx(vcpu); 4945 u32 zero = 0; 4946 gpa_t vmptr; 4947 u64 evmcs_gpa; 4948 int r; 4949 4950 if (!nested_vmx_check_permission(vcpu)) 4951 return 1; 4952 4953 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 4954 return r; 4955 4956 if (!page_address_valid(vcpu, vmptr)) 4957 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 4958 4959 if (vmptr == vmx->nested.vmxon_ptr) 4960 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 4961 4962 /* 4963 * When Enlightened VMEntry is enabled on the calling CPU we treat 4964 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 4965 * way to distinguish it from VMCS12) and we must not corrupt it by 4966 * writing to the non-existent 'launch_state' field. The area doesn't 4967 * have to be the currently active EVMCS on the calling CPU and there's 4968 * nothing KVM has to do to transition it from 'active' to 'non-active' 4969 * state. It is possible that the area will stay mapped as 4970 * vmx->nested.hv_evmcs but this shouldn't be a problem. 
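 *
 * In other words, when an enlightened VMCS is in use the emulated VMCLEAR
 * below reduces to reporting success; only the regular path releases the
 * cached vmcs12 and clears launch_state in guest memory.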
4971 */ 4972 if (likely(!vmx->nested.enlightened_vmcs_enabled || 4973 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 4974 if (vmptr == vmx->nested.current_vmptr) 4975 nested_release_vmcs12(vcpu); 4976 4977 kvm_vcpu_write_guest(vcpu, 4978 vmptr + offsetof(struct vmcs12, 4979 launch_state), 4980 &zero, sizeof(zero)); 4981 } 4982 4983 return nested_vmx_succeed(vcpu); 4984 } 4985 4986 /* Emulate the VMLAUNCH instruction */ 4987 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 4988 { 4989 return nested_vmx_run(vcpu, true); 4990 } 4991 4992 /* Emulate the VMRESUME instruction */ 4993 static int handle_vmresume(struct kvm_vcpu *vcpu) 4994 { 4995 4996 return nested_vmx_run(vcpu, false); 4997 } 4998 4999 static int handle_vmread(struct kvm_vcpu *vcpu) 5000 { 5001 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5002 : get_vmcs12(vcpu); 5003 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5004 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5005 struct vcpu_vmx *vmx = to_vmx(vcpu); 5006 struct x86_exception e; 5007 unsigned long field; 5008 u64 value; 5009 gva_t gva = 0; 5010 short offset; 5011 int len, r; 5012 5013 if (!nested_vmx_check_permission(vcpu)) 5014 return 1; 5015 5016 /* 5017 * In VMX non-root operation, when the VMCS-link pointer is -1ull, 5018 * any VMREAD sets the ALU flags for VMfailInvalid. 5019 */ 5020 if (vmx->nested.current_vmptr == -1ull || 5021 (is_guest_mode(vcpu) && 5022 get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) 5023 return nested_vmx_failInvalid(vcpu); 5024 5025 /* Decode instruction info and find the field to read */ 5026 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5027 5028 offset = vmcs_field_to_offset(field); 5029 if (offset < 0) 5030 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5031 5032 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5033 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5034 5035 /* Read the field, zero-extended to a u64 value */ 5036 value = vmcs12_read_any(vmcs12, field, offset); 5037 5038 /* 5039 * Now copy part of this value to register or memory, as requested. 5040 * Note that the number of bits actually copied is 32 or 64 depending 5041 * on the guest's mode (32 or 64 bit), not on the given field's length. 5042 */ 5043 if (instr_info & BIT(10)) { 5044 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5045 } else { 5046 len = is_64_bit_mode(vcpu) ? 8 : 4; 5047 if (get_vmx_mem_address(vcpu, exit_qualification, 5048 instr_info, true, len, &gva)) 5049 return 1; 5050 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5051 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5052 if (r != X86EMUL_CONTINUE) 5053 return kvm_handle_memory_failure(vcpu, r, &e); 5054 } 5055 5056 return nested_vmx_succeed(vcpu); 5057 } 5058 5059 static bool is_shadow_field_rw(unsigned long field) 5060 { 5061 switch (field) { 5062 #define SHADOW_FIELD_RW(x, y) case x: 5063 #include "vmcs_shadow_fields.h" 5064 return true; 5065 default: 5066 break; 5067 } 5068 return false; 5069 } 5070 5071 static bool is_shadow_field_ro(unsigned long field) 5072 { 5073 switch (field) { 5074 #define SHADOW_FIELD_RO(x, y) case x: 5075 #include "vmcs_shadow_fields.h" 5076 return true; 5077 default: 5078 break; 5079 } 5080 return false; 5081 } 5082 5083 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5084 { 5085 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? 
get_shadow_vmcs12(vcpu) 5086 : get_vmcs12(vcpu); 5087 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5088 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5089 struct vcpu_vmx *vmx = to_vmx(vcpu); 5090 struct x86_exception e; 5091 unsigned long field; 5092 short offset; 5093 gva_t gva; 5094 int len, r; 5095 5096 /* 5097 * The value to write might be 32 or 64 bits, depending on L1's long 5098 * mode, and eventually we need to write that into a field of several 5099 * possible lengths. The code below first zero-extends the value to 64 5100 * bit (value), and then copies only the appropriate number of 5101 * bits into the vmcs12 field. 5102 */ 5103 u64 value = 0; 5104 5105 if (!nested_vmx_check_permission(vcpu)) 5106 return 1; 5107 5108 /* 5109 * In VMX non-root operation, when the VMCS-link pointer is -1ull, 5110 * any VMWRITE sets the ALU flags for VMfailInvalid. 5111 */ 5112 if (vmx->nested.current_vmptr == -1ull || 5113 (is_guest_mode(vcpu) && 5114 get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) 5115 return nested_vmx_failInvalid(vcpu); 5116 5117 if (instr_info & BIT(10)) 5118 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5119 else { 5120 len = is_64_bit_mode(vcpu) ? 8 : 4; 5121 if (get_vmx_mem_address(vcpu, exit_qualification, 5122 instr_info, false, len, &gva)) 5123 return 1; 5124 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5125 if (r != X86EMUL_CONTINUE) 5126 return kvm_handle_memory_failure(vcpu, r, &e); 5127 } 5128 5129 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5130 5131 offset = vmcs_field_to_offset(field); 5132 if (offset < 0) 5133 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5134 5135 /* 5136 * If the vCPU supports "VMWRITE to any supported field in the 5137 * VMCS," then the "read-only" fields are actually read/write. 5138 */ 5139 if (vmcs_field_readonly(field) && 5140 !nested_cpu_has_vmwrite_any_field(vcpu)) 5141 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5142 5143 /* 5144 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5145 * vmcs12, else we may crush a field or consume a stale value. 5146 */ 5147 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5148 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5149 5150 /* 5151 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5152 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5153 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5154 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5155 * from L1 will return a different value than VMREAD from L2 (L1 sees 5156 * the stripped down value, L2 sees the full value as stored by KVM). 5157 */ 5158 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5159 value &= 0x1f0ff; 5160 5161 vmcs12_write_any(vmcs12, field, offset, value); 5162 5163 /* 5164 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5165 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5166 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5167 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5168 */ 5169 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5170 /* 5171 * L1 can read these fields without exiting, ensure the 5172 * shadow VMCS is up-to-date. 
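* Note, __vmcs_writel() operates on the CPU's current VMCS, so vmcs01's shadow VMCS must be loaded (and afterwards cleared and replaced by the original VMCS) with preemption disabled.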
5173 */ 5174 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5175 preempt_disable(); 5176 vmcs_load(vmx->vmcs01.shadow_vmcs); 5177 5178 __vmcs_writel(field, value); 5179 5180 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5181 vmcs_load(vmx->loaded_vmcs->vmcs); 5182 preempt_enable(); 5183 } 5184 vmx->nested.dirty_vmcs12 = true; 5185 } 5186 5187 return nested_vmx_succeed(vcpu); 5188 } 5189 5190 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5191 { 5192 vmx->nested.current_vmptr = vmptr; 5193 if (enable_shadow_vmcs) { 5194 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5195 vmcs_write64(VMCS_LINK_POINTER, 5196 __pa(vmx->vmcs01.shadow_vmcs)); 5197 vmx->nested.need_vmcs12_to_shadow_sync = true; 5198 } 5199 vmx->nested.dirty_vmcs12 = true; 5200 } 5201 5202 /* Emulate the VMPTRLD instruction */ 5203 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5204 { 5205 struct vcpu_vmx *vmx = to_vmx(vcpu); 5206 gpa_t vmptr; 5207 int r; 5208 5209 if (!nested_vmx_check_permission(vcpu)) 5210 return 1; 5211 5212 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5213 return r; 5214 5215 if (!page_address_valid(vcpu, vmptr)) 5216 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5217 5218 if (vmptr == vmx->nested.vmxon_ptr) 5219 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5220 5221 /* Forbid normal VMPTRLD if Enlightened version was used */ 5222 if (vmx->nested.hv_evmcs) 5223 return 1; 5224 5225 if (vmx->nested.current_vmptr != vmptr) { 5226 struct kvm_host_map map; 5227 struct vmcs12 *new_vmcs12; 5228 5229 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { 5230 /* 5231 * Reads from an unbacked page return all 1s, 5232 * which means that the 32 bits located at the 5233 * given physical address won't match the required 5234 * VMCS12_REVISION identifier. 5235 */ 5236 return nested_vmx_fail(vcpu, 5237 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5238 } 5239 5240 new_vmcs12 = map.hva; 5241 5242 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || 5243 (new_vmcs12->hdr.shadow_vmcs && 5244 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5245 kvm_vcpu_unmap(vcpu, &map, false); 5246 return nested_vmx_fail(vcpu, 5247 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5248 } 5249 5250 nested_release_vmcs12(vcpu); 5251 5252 /* 5253 * Load VMCS12 from guest memory since it is not already 5254 * cached. 
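* Only VMCS12_SIZE bytes are cached; the temporary mapping is dropped right after the copy, and the cached vmcs12 is used until nested_release_vmcs12() flushes it back to guest memory.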
5255 */ 5256 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); 5257 kvm_vcpu_unmap(vcpu, &map, false); 5258 5259 set_current_vmptr(vmx, vmptr); 5260 } 5261 5262 return nested_vmx_succeed(vcpu); 5263 } 5264 5265 /* Emulate the VMPTRST instruction */ 5266 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5267 { 5268 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5269 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5270 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5271 struct x86_exception e; 5272 gva_t gva; 5273 int r; 5274 5275 if (!nested_vmx_check_permission(vcpu)) 5276 return 1; 5277 5278 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) 5279 return 1; 5280 5281 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5282 true, sizeof(gpa_t), &gva)) 5283 return 1; 5284 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5285 r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, 5286 sizeof(gpa_t), &e); 5287 if (r != X86EMUL_CONTINUE) 5288 return kvm_handle_memory_failure(vcpu, r, &e); 5289 5290 return nested_vmx_succeed(vcpu); 5291 } 5292 5293 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 5294 5295 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 5296 { 5297 return VALID_PAGE(root_hpa) && 5298 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 5299 } 5300 5301 /* Emulate the INVEPT instruction */ 5302 static int handle_invept(struct kvm_vcpu *vcpu) 5303 { 5304 struct vcpu_vmx *vmx = to_vmx(vcpu); 5305 u32 vmx_instruction_info, types; 5306 unsigned long type, roots_to_free; 5307 struct kvm_mmu *mmu; 5308 gva_t gva; 5309 struct x86_exception e; 5310 struct { 5311 u64 eptp, gpa; 5312 } operand; 5313 int i, r; 5314 5315 if (!(vmx->nested.msrs.secondary_ctls_high & 5316 SECONDARY_EXEC_ENABLE_EPT) || 5317 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5318 kvm_queue_exception(vcpu, UD_VECTOR); 5319 return 1; 5320 } 5321 5322 if (!nested_vmx_check_permission(vcpu)) 5323 return 1; 5324 5325 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5326 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); 5327 5328 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5329 5330 if (type >= 32 || !(types & (1 << type))) 5331 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5332 5333 /* According to the Intel VMX instruction reference, the memory 5334 * operand is read even if it isn't needed (e.g., for type==global) 5335 */ 5336 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5337 vmx_instruction_info, false, sizeof(operand), &gva)) 5338 return 1; 5339 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5340 if (r != X86EMUL_CONTINUE) 5341 return kvm_handle_memory_failure(vcpu, r, &e); 5342 5343 /* 5344 * Nested EPT roots are always held through guest_mmu, 5345 * not root_mmu.
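* (root_mmu holds the roots used when nested EPT is not in use; those are never invalidated by an emulated INVEPT.)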
5346 */ 5347 mmu = &vcpu->arch.guest_mmu; 5348 5349 switch (type) { 5350 case VMX_EPT_EXTENT_CONTEXT: 5351 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5352 return nested_vmx_fail(vcpu, 5353 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5354 5355 roots_to_free = 0; 5356 if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd, 5357 operand.eptp)) 5358 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5359 5360 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5361 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5362 mmu->prev_roots[i].pgd, 5363 operand.eptp)) 5364 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5365 } 5366 break; 5367 case VMX_EPT_EXTENT_GLOBAL: 5368 roots_to_free = KVM_MMU_ROOTS_ALL; 5369 break; 5370 default: 5371 BUG(); 5372 break; 5373 } 5374 5375 if (roots_to_free) 5376 kvm_mmu_free_roots(vcpu, mmu, roots_to_free); 5377 5378 return nested_vmx_succeed(vcpu); 5379 } 5380 5381 static int handle_invvpid(struct kvm_vcpu *vcpu) 5382 { 5383 struct vcpu_vmx *vmx = to_vmx(vcpu); 5384 u32 vmx_instruction_info; 5385 unsigned long type, types; 5386 gva_t gva; 5387 struct x86_exception e; 5388 struct { 5389 u64 vpid; 5390 u64 gla; 5391 } operand; 5392 u16 vpid02; 5393 int r; 5394 5395 if (!(vmx->nested.msrs.secondary_ctls_high & 5396 SECONDARY_EXEC_ENABLE_VPID) || 5397 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5398 kvm_queue_exception(vcpu, UD_VECTOR); 5399 return 1; 5400 } 5401 5402 if (!nested_vmx_check_permission(vcpu)) 5403 return 1; 5404 5405 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5406 type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); 5407 5408 types = (vmx->nested.msrs.vpid_caps & 5409 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5410 5411 if (type >= 32 || !(types & (1 << type))) 5412 return nested_vmx_fail(vcpu, 5413 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5414 5415 /* according to the intel vmx instruction reference, the memory 5416 * operand is read even if it isn't needed (e.g., for type==global) 5417 */ 5418 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5419 vmx_instruction_info, false, sizeof(operand), &gva)) 5420 return 1; 5421 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5422 if (r != X86EMUL_CONTINUE) 5423 return kvm_handle_memory_failure(vcpu, r, &e); 5424 5425 if (operand.vpid >> 16) 5426 return nested_vmx_fail(vcpu, 5427 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5428 5429 vpid02 = nested_get_vpid02(vcpu); 5430 switch (type) { 5431 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5432 if (!operand.vpid || 5433 is_noncanonical_address(operand.gla, vcpu)) 5434 return nested_vmx_fail(vcpu, 5435 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5436 vpid_sync_vcpu_addr(vpid02, operand.gla); 5437 break; 5438 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5439 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5440 if (!operand.vpid) 5441 return nested_vmx_fail(vcpu, 5442 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5443 vpid_sync_context(vpid02); 5444 break; 5445 case VMX_VPID_EXTENT_ALL_CONTEXT: 5446 vpid_sync_context(vpid02); 5447 break; 5448 default: 5449 WARN_ON_ONCE(1); 5450 return kvm_skip_emulated_instruction(vcpu); 5451 } 5452 5453 /* 5454 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5455 * linear mappings for L2 (tagged with L2's VPID). Free all roots as 5456 * VPIDs are not tracked in the MMU role. 5457 * 5458 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5459 * an MMU when EPT is disabled. 5460 * 5461 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 
5462 */ 5463 if (!enable_ept) 5464 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, 5465 KVM_MMU_ROOTS_ALL); 5466 5467 return nested_vmx_succeed(vcpu); 5468 } 5469 5470 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5471 struct vmcs12 *vmcs12) 5472 { 5473 u32 index = kvm_rcx_read(vcpu); 5474 u64 new_eptp; 5475 bool accessed_dirty; 5476 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 5477 5478 if (!nested_cpu_has_eptp_switching(vmcs12) || 5479 !nested_cpu_has_ept(vmcs12)) 5480 return 1; 5481 5482 if (index >= VMFUNC_EPTP_ENTRIES) 5483 return 1; 5484 5485 5486 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5487 &new_eptp, index * 8, 8)) 5488 return 1; 5489 5490 accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT); 5491 5492 /* 5493 * If the (L2) guest does a vmfunc to the currently 5494 * active ept pointer, we don't have to do anything else 5495 */ 5496 if (vmcs12->ept_pointer != new_eptp) { 5497 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5498 return 1; 5499 5500 mmu->ept_ad = accessed_dirty; 5501 mmu->mmu_role.base.ad_disabled = !accessed_dirty; 5502 vmcs12->ept_pointer = new_eptp; 5503 5504 kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); 5505 } 5506 5507 return 0; 5508 } 5509 5510 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5511 { 5512 struct vcpu_vmx *vmx = to_vmx(vcpu); 5513 struct vmcs12 *vmcs12; 5514 u32 function = kvm_rax_read(vcpu); 5515 5516 /* 5517 * VMFUNC is only supported for nested guests, but we always enable the 5518 * secondary control for simplicity; for non-nested mode, fake that we 5519 * didn't by injecting #UD. 5520 */ 5521 if (!is_guest_mode(vcpu)) { 5522 kvm_queue_exception(vcpu, UD_VECTOR); 5523 return 1; 5524 } 5525 5526 vmcs12 = get_vmcs12(vcpu); 5527 if ((vmcs12->vm_function_control & (1 << function)) == 0) 5528 goto fail; 5529 5530 switch (function) { 5531 case 0: 5532 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5533 goto fail; 5534 break; 5535 default: 5536 goto fail; 5537 } 5538 return kvm_skip_emulated_instruction(vcpu); 5539 5540 fail: 5541 /* 5542 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5543 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5544 * EXIT_REASON_VMFUNC as the exit reason. 5545 */ 5546 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5547 vmx_get_intr_info(vcpu), 5548 vmx_get_exit_qual(vcpu)); 5549 return 1; 5550 } 5551 5552 /* 5553 * Return true if an IO instruction with the specified port and size should cause 5554 * a VM-exit into L1. 
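* Per the SDM, I/O bitmap A covers ports 0x0000-0x7fff and bitmap B covers ports 0x8000-0xffff, one bit per port; a multi-byte access exits if the bit for any accessed port is set.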
5555 */ 5556 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5557 int size) 5558 { 5559 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5560 gpa_t bitmap, last_bitmap; 5561 u8 b; 5562 5563 last_bitmap = (gpa_t)-1; 5564 b = -1; 5565 5566 while (size > 0) { 5567 if (port < 0x8000) 5568 bitmap = vmcs12->io_bitmap_a; 5569 else if (port < 0x10000) 5570 bitmap = vmcs12->io_bitmap_b; 5571 else 5572 return true; 5573 bitmap += (port & 0x7fff) / 8; 5574 5575 if (last_bitmap != bitmap) 5576 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5577 return true; 5578 if (b & (1 << (port & 7))) 5579 return true; 5580 5581 port++; 5582 size--; 5583 last_bitmap = bitmap; 5584 } 5585 5586 return false; 5587 } 5588 5589 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5590 struct vmcs12 *vmcs12) 5591 { 5592 unsigned long exit_qualification; 5593 unsigned short port; 5594 int size; 5595 5596 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5597 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 5598 5599 exit_qualification = vmx_get_exit_qual(vcpu); 5600 5601 port = exit_qualification >> 16; 5602 size = (exit_qualification & 7) + 1; 5603 5604 return nested_vmx_check_io_bitmaps(vcpu, port, size); 5605 } 5606 5607 /* 5608 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 5609 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 5610 * disinterest in the current event (read or write a specific MSR) by using an 5611 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 5612 */ 5613 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 5614 struct vmcs12 *vmcs12, 5615 union vmx_exit_reason exit_reason) 5616 { 5617 u32 msr_index = kvm_rcx_read(vcpu); 5618 gpa_t bitmap; 5619 5620 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 5621 return true; 5622 5623 /* 5624 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 5625 * for the four combinations of read/write and low/high MSR numbers. 5626 * First we need to figure out which of the four to use: 5627 */ 5628 bitmap = vmcs12->msr_bitmap; 5629 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 5630 bitmap += 2048; 5631 if (msr_index >= 0xc0000000) { 5632 msr_index -= 0xc0000000; 5633 bitmap += 1024; 5634 } 5635 5636 /* Then read the msr_index'th bit from this bitmap: */ 5637 if (msr_index < 1024*8) { 5638 unsigned char b; 5639 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 5640 return true; 5641 return 1 & (b >> (msr_index & 7)); 5642 } else 5643 return true; /* let L1 handle the wrong parameter */ 5644 } 5645 5646 /* 5647 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 5648 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 5649 * intercept (via guest_host_mask etc.) the current event. 
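* For MOV-to-CR0/CR4 and LMSW, an exit is needed only if a bit owned by L1 (set in the guest/host mask) would change relative to L1's read shadow.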
5650 */ 5651 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 5652 struct vmcs12 *vmcs12) 5653 { 5654 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5655 int cr = exit_qualification & 15; 5656 int reg; 5657 unsigned long val; 5658 5659 switch ((exit_qualification >> 4) & 3) { 5660 case 0: /* mov to cr */ 5661 reg = (exit_qualification >> 8) & 15; 5662 val = kvm_register_read(vcpu, reg); 5663 switch (cr) { 5664 case 0: 5665 if (vmcs12->cr0_guest_host_mask & 5666 (val ^ vmcs12->cr0_read_shadow)) 5667 return true; 5668 break; 5669 case 3: 5670 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 5671 return true; 5672 break; 5673 case 4: 5674 if (vmcs12->cr4_guest_host_mask & 5675 (vmcs12->cr4_read_shadow ^ val)) 5676 return true; 5677 break; 5678 case 8: 5679 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 5680 return true; 5681 break; 5682 } 5683 break; 5684 case 2: /* clts */ 5685 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 5686 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 5687 return true; 5688 break; 5689 case 1: /* mov from cr */ 5690 switch (cr) { 5691 case 3: 5692 if (vmcs12->cpu_based_vm_exec_control & 5693 CPU_BASED_CR3_STORE_EXITING) 5694 return true; 5695 break; 5696 case 8: 5697 if (vmcs12->cpu_based_vm_exec_control & 5698 CPU_BASED_CR8_STORE_EXITING) 5699 return true; 5700 break; 5701 } 5702 break; 5703 case 3: /* lmsw */ 5704 /* 5705 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 5706 * cr0. Other attempted changes are ignored, with no exit. 5707 */ 5708 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5709 if (vmcs12->cr0_guest_host_mask & 0xe & 5710 (val ^ vmcs12->cr0_read_shadow)) 5711 return true; 5712 if ((vmcs12->cr0_guest_host_mask & 0x1) && 5713 !(vmcs12->cr0_read_shadow & 0x1) && 5714 (val & 0x1)) 5715 return true; 5716 break; 5717 } 5718 return false; 5719 } 5720 5721 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 5722 struct vmcs12 *vmcs12) 5723 { 5724 u32 encls_leaf; 5725 5726 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 5727 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 5728 return false; 5729 5730 encls_leaf = kvm_rax_read(vcpu); 5731 if (encls_leaf > 62) 5732 encls_leaf = 63; 5733 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 5734 } 5735 5736 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 5737 struct vmcs12 *vmcs12, gpa_t bitmap) 5738 { 5739 u32 vmx_instruction_info; 5740 unsigned long field; 5741 u8 b; 5742 5743 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 5744 return true; 5745 5746 /* Decode instruction info and find the field to access */ 5747 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5748 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 5749 5750 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 5751 if (field >> 15) 5752 return true; 5753 5754 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 5755 return true; 5756 5757 return 1 & (b >> (field & 7)); 5758 } 5759 5760 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 5761 { 5762 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 5763 5764 if (nested_cpu_has_mtf(vmcs12)) 5765 return true; 5766 5767 /* 5768 * An MTF VM-exit may be injected into the guest by setting the 5769 * interruption-type to 7 (other event) and the vector field to 0. Such 5770 * is the case regardless of the 'monitor trap flag' VM-execution 5771 * control. 
5772 */ 5773 return entry_intr_info == (INTR_INFO_VALID_MASK 5774 | INTR_TYPE_OTHER_EVENT); 5775 } 5776 5777 /* 5778 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 5779 * L1 wants the exit. Only call this when in is_guest_mode (L2). 5780 */ 5781 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 5782 union vmx_exit_reason exit_reason) 5783 { 5784 u32 intr_info; 5785 5786 switch ((u16)exit_reason.basic) { 5787 case EXIT_REASON_EXCEPTION_NMI: 5788 intr_info = vmx_get_intr_info(vcpu); 5789 if (is_nmi(intr_info)) 5790 return true; 5791 else if (is_page_fault(intr_info)) 5792 return vcpu->arch.apf.host_apf_flags || !enable_ept; 5793 else if (is_debug(intr_info) && 5794 vcpu->guest_debug & 5795 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5796 return true; 5797 else if (is_breakpoint(intr_info) && 5798 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5799 return true; 5800 return false; 5801 case EXIT_REASON_EXTERNAL_INTERRUPT: 5802 return true; 5803 case EXIT_REASON_MCE_DURING_VMENTRY: 5804 return true; 5805 case EXIT_REASON_EPT_VIOLATION: 5806 /* 5807 * L0 always deals with the EPT violation. If nested EPT is 5808 * used, and the nested mmu code discovers that the address is 5809 * missing in the guest EPT table (EPT12), the EPT violation 5810 * will be injected with nested_ept_inject_page_fault() 5811 */ 5812 return true; 5813 case EXIT_REASON_EPT_MISCONFIG: 5814 /* 5815 * L2 never uses directly L1's EPT, but rather L0's own EPT 5816 * table (shadow on EPT) or a merged EPT table that L0 built 5817 * (EPT on EPT). So any problems with the structure of the 5818 * table is L0's fault. 5819 */ 5820 return true; 5821 case EXIT_REASON_PREEMPTION_TIMER: 5822 return true; 5823 case EXIT_REASON_PML_FULL: 5824 /* 5825 * PML is emulated for an L1 VMM and should never be enabled in 5826 * vmcs02, always "handle" PML_FULL by exiting to userspace. 5827 */ 5828 return true; 5829 case EXIT_REASON_VMFUNC: 5830 /* VM functions are emulated through L2->L0 vmexits. */ 5831 return true; 5832 default: 5833 break; 5834 } 5835 return false; 5836 } 5837 5838 /* 5839 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 5840 * is_guest_mode (L2). 
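* The caller, nested_vmx_reflect_vmexit(), has already given L0 first claim on the exit via nested_vmx_l0_wants_exit(); returning true here reflects the exit into L1.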
5841 */ 5842 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 5843 union vmx_exit_reason exit_reason) 5844 { 5845 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5846 u32 intr_info; 5847 5848 switch ((u16)exit_reason.basic) { 5849 case EXIT_REASON_EXCEPTION_NMI: 5850 intr_info = vmx_get_intr_info(vcpu); 5851 if (is_nmi(intr_info)) 5852 return true; 5853 else if (is_page_fault(intr_info)) 5854 return true; 5855 return vmcs12->exception_bitmap & 5856 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 5857 case EXIT_REASON_EXTERNAL_INTERRUPT: 5858 return nested_exit_on_intr(vcpu); 5859 case EXIT_REASON_TRIPLE_FAULT: 5860 return true; 5861 case EXIT_REASON_INTERRUPT_WINDOW: 5862 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 5863 case EXIT_REASON_NMI_WINDOW: 5864 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 5865 case EXIT_REASON_TASK_SWITCH: 5866 return true; 5867 case EXIT_REASON_CPUID: 5868 return true; 5869 case EXIT_REASON_HLT: 5870 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 5871 case EXIT_REASON_INVD: 5872 return true; 5873 case EXIT_REASON_INVLPG: 5874 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5875 case EXIT_REASON_RDPMC: 5876 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 5877 case EXIT_REASON_RDRAND: 5878 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 5879 case EXIT_REASON_RDSEED: 5880 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 5881 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 5882 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 5883 case EXIT_REASON_VMREAD: 5884 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5885 vmcs12->vmread_bitmap); 5886 case EXIT_REASON_VMWRITE: 5887 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5888 vmcs12->vmwrite_bitmap); 5889 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 5890 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 5891 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 5892 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 5893 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 5894 /* 5895 * VMX instructions trap unconditionally. This allows L1 to 5896 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
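* (VMREAD and VMWRITE are the exception, handled above: with a shadow VMCS they are reflected only if the accessed field is intercepted by vmcs12's VMREAD/VMWRITE bitmaps, or is out of range.)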
5897 */ 5898 return true; 5899 case EXIT_REASON_CR_ACCESS: 5900 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 5901 case EXIT_REASON_DR_ACCESS: 5902 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 5903 case EXIT_REASON_IO_INSTRUCTION: 5904 return nested_vmx_exit_handled_io(vcpu, vmcs12); 5905 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 5906 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 5907 case EXIT_REASON_MSR_READ: 5908 case EXIT_REASON_MSR_WRITE: 5909 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 5910 case EXIT_REASON_INVALID_STATE: 5911 return true; 5912 case EXIT_REASON_MWAIT_INSTRUCTION: 5913 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 5914 case EXIT_REASON_MONITOR_TRAP_FLAG: 5915 return nested_vmx_exit_handled_mtf(vmcs12); 5916 case EXIT_REASON_MONITOR_INSTRUCTION: 5917 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 5918 case EXIT_REASON_PAUSE_INSTRUCTION: 5919 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 5920 nested_cpu_has2(vmcs12, 5921 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 5922 case EXIT_REASON_MCE_DURING_VMENTRY: 5923 return true; 5924 case EXIT_REASON_TPR_BELOW_THRESHOLD: 5925 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 5926 case EXIT_REASON_APIC_ACCESS: 5927 case EXIT_REASON_APIC_WRITE: 5928 case EXIT_REASON_EOI_INDUCED: 5929 /* 5930 * The controls for "virtualize APIC accesses," "APIC- 5931 * register virtualization," and "virtual-interrupt 5932 * delivery" only come from vmcs12. 5933 */ 5934 return true; 5935 case EXIT_REASON_INVPCID: 5936 return 5937 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 5938 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5939 case EXIT_REASON_WBINVD: 5940 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 5941 case EXIT_REASON_XSETBV: 5942 return true; 5943 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 5944 /* 5945 * This should never happen, since it is not possible to 5946 * set XSS to a non-zero value---neither in L1 nor in L2. 5947 * If if it were, XSS would have to be checked against 5948 * the XSS exit bitmap in vmcs12. 5949 */ 5950 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 5951 case EXIT_REASON_UMWAIT: 5952 case EXIT_REASON_TPAUSE: 5953 return nested_cpu_has2(vmcs12, 5954 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 5955 case EXIT_REASON_ENCLS: 5956 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 5957 default: 5958 return true; 5959 } 5960 } 5961 5962 /* 5963 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 5964 * reflected into L1. 5965 */ 5966 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 5967 { 5968 struct vcpu_vmx *vmx = to_vmx(vcpu); 5969 union vmx_exit_reason exit_reason = vmx->exit_reason; 5970 unsigned long exit_qual; 5971 u32 exit_intr_info; 5972 5973 WARN_ON_ONCE(vmx->nested.nested_run_pending); 5974 5975 /* 5976 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 5977 * has already loaded L2's state. 5978 */ 5979 if (unlikely(vmx->fail)) { 5980 trace_kvm_nested_vmenter_failed( 5981 "hardware VM-instruction error: ", 5982 vmcs_read32(VM_INSTRUCTION_ERROR)); 5983 exit_intr_info = 0; 5984 exit_qual = 0; 5985 goto reflect_vmexit; 5986 } 5987 5988 trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX); 5989 5990 /* If L0 (KVM) wants the exit, it trumps L1's desires. */ 5991 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 5992 return false; 5993 5994 /* If L1 doesn't want the exit, handle it in L0. 
*/ 5995 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 5996 return false; 5997 5998 /* 5999 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6000 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6001 * need to be synthesized by querying the in-kernel LAPIC, but external 6002 * interrupts are never reflected to L1 so it's a non-issue. 6003 */ 6004 exit_intr_info = vmx_get_intr_info(vcpu); 6005 if (is_exception_with_error_code(exit_intr_info)) { 6006 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6007 6008 vmcs12->vm_exit_intr_error_code = 6009 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6010 } 6011 exit_qual = vmx_get_exit_qual(vcpu); 6012 6013 reflect_vmexit: 6014 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6015 return true; 6016 } 6017 6018 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6019 struct kvm_nested_state __user *user_kvm_nested_state, 6020 u32 user_data_size) 6021 { 6022 struct vcpu_vmx *vmx; 6023 struct vmcs12 *vmcs12; 6024 struct kvm_nested_state kvm_state = { 6025 .flags = 0, 6026 .format = KVM_STATE_NESTED_FORMAT_VMX, 6027 .size = sizeof(kvm_state), 6028 .hdr.vmx.flags = 0, 6029 .hdr.vmx.vmxon_pa = -1ull, 6030 .hdr.vmx.vmcs12_pa = -1ull, 6031 .hdr.vmx.preemption_timer_deadline = 0, 6032 }; 6033 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6034 &user_kvm_nested_state->data.vmx[0]; 6035 6036 if (!vcpu) 6037 return kvm_state.size + sizeof(*user_vmx_nested_state); 6038 6039 vmx = to_vmx(vcpu); 6040 vmcs12 = get_vmcs12(vcpu); 6041 6042 if (nested_vmx_allowed(vcpu) && 6043 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6044 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6045 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6046 6047 if (vmx_has_valid_vmcs12(vcpu)) { 6048 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6049 6050 if (vmx->nested.hv_evmcs) 6051 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6052 6053 if (is_guest_mode(vcpu) && 6054 nested_cpu_has_shadow_vmcs(vmcs12) && 6055 vmcs12->vmcs_link_pointer != -1ull) 6056 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6057 } 6058 6059 if (vmx->nested.smm.vmxon) 6060 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6061 6062 if (vmx->nested.smm.guest_mode) 6063 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6064 6065 if (is_guest_mode(vcpu)) { 6066 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6067 6068 if (vmx->nested.nested_run_pending) 6069 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6070 6071 if (vmx->nested.mtf_pending) 6072 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6073 6074 if (nested_cpu_has_preemption_timer(vmcs12) && 6075 vmx->nested.has_preemption_timer_deadline) { 6076 kvm_state.hdr.vmx.flags |= 6077 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6078 kvm_state.hdr.vmx.preemption_timer_deadline = 6079 vmx->nested.preemption_timer_deadline; 6080 } 6081 } 6082 } 6083 6084 if (user_data_size < kvm_state.size) 6085 goto out; 6086 6087 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6088 return -EFAULT; 6089 6090 if (!vmx_has_valid_vmcs12(vcpu)) 6091 goto out; 6092 6093 /* 6094 * When running L2, the authoritative vmcs12 state is in the 6095 * vmcs02. When running L1, the authoritative vmcs12 state is 6096 * in the shadow or enlightened vmcs linked to vmcs01, unless 6097 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6098 * vmcs12 state is in the vmcs12 already. 
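* The syncs below therefore pull the latest state into the cached vmcs12 before it is copied out to userspace.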
6099 */ 6100 if (is_guest_mode(vcpu)) { 6101 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6102 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6103 } else { 6104 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6105 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6106 if (vmx->nested.hv_evmcs) 6107 copy_enlightened_to_vmcs12(vmx); 6108 else if (enable_shadow_vmcs) 6109 copy_shadow_to_vmcs12(vmx); 6110 } 6111 } 6112 6113 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6114 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6115 6116 /* 6117 * Copy over the full allocated size of vmcs12 rather than just the size 6118 * of the struct. 6119 */ 6120 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6121 return -EFAULT; 6122 6123 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6124 vmcs12->vmcs_link_pointer != -1ull) { 6125 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6126 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6127 return -EFAULT; 6128 } 6129 out: 6130 return kvm_state.size; 6131 } 6132 6133 /* 6134 * Forcibly leave nested mode in order to be able to reset the VCPU later on. 6135 */ 6136 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6137 { 6138 if (is_guest_mode(vcpu)) { 6139 to_vmx(vcpu)->nested.nested_run_pending = 0; 6140 nested_vmx_vmexit(vcpu, -1, 0, 0); 6141 } 6142 free_nested(vcpu); 6143 } 6144 6145 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6146 struct kvm_nested_state __user *user_kvm_nested_state, 6147 struct kvm_nested_state *kvm_state) 6148 { 6149 struct vcpu_vmx *vmx = to_vmx(vcpu); 6150 struct vmcs12 *vmcs12; 6151 enum vm_entry_failure_code ignored; 6152 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6153 &user_kvm_nested_state->data.vmx[0]; 6154 int ret; 6155 6156 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6157 return -EINVAL; 6158 6159 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) { 6160 if (kvm_state->hdr.vmx.smm.flags) 6161 return -EINVAL; 6162 6163 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) 6164 return -EINVAL; 6165 6166 /* 6167 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6168 * enable eVMCS capability on vCPU. However, since then 6169 * code was changed such that flag signals vmcs12 should 6170 * be copied into eVMCS in guest memory. 6171 * 6172 * To preserve backwards compatability, allow user 6173 * to set this flag even when there is no VMXON region. 6174 */ 6175 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6176 return -EINVAL; 6177 } else { 6178 if (!nested_vmx_allowed(vcpu)) 6179 return -EINVAL; 6180 6181 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6182 return -EINVAL; 6183 } 6184 6185 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6186 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6187 return -EINVAL; 6188 6189 if (kvm_state->hdr.vmx.smm.flags & 6190 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6191 return -EINVAL; 6192 6193 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6194 return -EINVAL; 6195 6196 /* 6197 * SMM temporarily disables VMX, so we cannot be in guest mode, 6198 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6199 * must be zero. 6200 */ 6201 if (is_smm(vcpu) ? 
6202 (kvm_state->flags & 6203 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6204 : kvm_state->hdr.vmx.smm.flags) 6205 return -EINVAL; 6206 6207 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6208 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6209 return -EINVAL; 6210 6211 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6212 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) 6213 return -EINVAL; 6214 6215 vmx_leave_nested(vcpu); 6216 6217 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) 6218 return 0; 6219 6220 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6221 ret = enter_vmx_operation(vcpu); 6222 if (ret) 6223 return ret; 6224 6225 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6226 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6227 /* See vmx_has_valid_vmcs12. */ 6228 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6229 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6230 (kvm_state->hdr.vmx.vmcs12_pa != -1ull)) 6231 return -EINVAL; 6232 else 6233 return 0; 6234 } 6235 6236 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { 6237 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6238 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6239 return -EINVAL; 6240 6241 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6242 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6243 /* 6244 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6245 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6246 * restored yet. EVMCS will be mapped from 6247 * nested_get_vmcs12_pages(). 6248 */ 6249 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 6250 } else { 6251 return -EINVAL; 6252 } 6253 6254 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 6255 vmx->nested.smm.vmxon = true; 6256 vmx->nested.vmxon = false; 6257 6258 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 6259 vmx->nested.smm.guest_mode = true; 6260 } 6261 6262 vmcs12 = get_vmcs12(vcpu); 6263 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 6264 return -EFAULT; 6265 6266 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 6267 return -EINVAL; 6268 6269 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6270 return 0; 6271 6272 vmx->nested.nested_run_pending = 6273 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 6274 6275 vmx->nested.mtf_pending = 6276 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 6277 6278 ret = -EINVAL; 6279 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6280 vmcs12->vmcs_link_pointer != -1ull) { 6281 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6282 6283 if (kvm_state->size < 6284 sizeof(*kvm_state) + 6285 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 6286 goto error_guest_mode; 6287 6288 if (copy_from_user(shadow_vmcs12, 6289 user_vmx_nested_state->shadow_vmcs12, 6290 sizeof(*shadow_vmcs12))) { 6291 ret = -EFAULT; 6292 goto error_guest_mode; 6293 } 6294 6295 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6296 !shadow_vmcs12->hdr.shadow_vmcs) 6297 goto error_guest_mode; 6298 } 6299 6300 vmx->nested.has_preemption_timer_deadline = false; 6301 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 6302 vmx->nested.has_preemption_timer_deadline = true; 6303 vmx->nested.preemption_timer_deadline = 6304 kvm_state->hdr.vmx.preemption_timer_deadline; 6305 } 6306 6307 if (nested_vmx_check_controls(vcpu, vmcs12) || 6308 
nested_vmx_check_host_state(vcpu, vmcs12) || 6309 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6310 goto error_guest_mode; 6311 6312 vmx->nested.dirty_vmcs12 = true; 6313 ret = nested_vmx_enter_non_root_mode(vcpu, false); 6314 if (ret) 6315 goto error_guest_mode; 6316 6317 return 0; 6318 6319 error_guest_mode: 6320 vmx->nested.nested_run_pending = 0; 6321 return ret; 6322 } 6323 6324 void nested_vmx_set_vmcs_shadowing_bitmap(void) 6325 { 6326 if (enable_shadow_vmcs) { 6327 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 6328 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 6329 } 6330 } 6331 6332 /* 6333 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 6334 * returned for the various VMX controls MSRs when nested VMX is enabled. 6335 * The same values should also be used to verify that vmcs12 control fields are 6336 * valid during nested entry from L1 to L2. 6337 * Each of these control msrs has a low and high 32-bit half: A low bit is on 6338 * if the corresponding bit in the (32-bit) control field *must* be on, and a 6339 * bit in the high half is on if the corresponding bit in the control field 6340 * may be on. See also vmx_control_verify(). 6341 */ 6342 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) 6343 { 6344 /* 6345 * Note that as a general rule, the high half of the MSRs (bits in 6346 * the control fields which may be 1) should be initialized by the 6347 * intersection of the underlying hardware's MSR (i.e., features which 6348 * can be supported) and the list of features we want to expose - 6349 * because they are known to be properly supported in our code. 6350 * Also, usually, the low half of the MSRs (bits which must be 1) can 6351 * be set to 0, meaning that L1 may turn off any of these bits. The 6352 * reason is that if one of these bits is necessary, it will appear 6353 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 6354 * fields of vmcs01 and vmcs02, will turn these bits off - and 6355 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 6356 * These rules have exceptions below. 6357 */ 6358 6359 /* pin-based controls */ 6360 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 6361 msrs->pinbased_ctls_low, 6362 msrs->pinbased_ctls_high); 6363 msrs->pinbased_ctls_low |= 6364 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6365 msrs->pinbased_ctls_high &= 6366 PIN_BASED_EXT_INTR_MASK | 6367 PIN_BASED_NMI_EXITING | 6368 PIN_BASED_VIRTUAL_NMIS | 6369 (enable_apicv ? PIN_BASED_POSTED_INTR : 0); 6370 msrs->pinbased_ctls_high |= 6371 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6372 PIN_BASED_VMX_PREEMPTION_TIMER; 6373 6374 /* exit controls */ 6375 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 6376 msrs->exit_ctls_low, 6377 msrs->exit_ctls_high); 6378 msrs->exit_ctls_low = 6379 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 6380 6381 msrs->exit_ctls_high &= 6382 #ifdef CONFIG_X86_64 6383 VM_EXIT_HOST_ADDR_SPACE_SIZE | 6384 #endif 6385 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 6386 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 6387 msrs->exit_ctls_high |= 6388 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 6389 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 6390 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 6391 6392 /* We support free control of debug control saving. 
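* i.e. VM_EXIT_SAVE_DEBUG_CONTROLS is not forced to 1; with the TRUE controls MSRs, L1 may clear it.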
*/ 6393 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 6394 6395 /* entry controls */ 6396 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 6397 msrs->entry_ctls_low, 6398 msrs->entry_ctls_high); 6399 msrs->entry_ctls_low = 6400 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 6401 msrs->entry_ctls_high &= 6402 #ifdef CONFIG_X86_64 6403 VM_ENTRY_IA32E_MODE | 6404 #endif 6405 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | 6406 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 6407 msrs->entry_ctls_high |= 6408 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 6409 6410 /* We support free control of debug control loading. */ 6411 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 6412 6413 /* cpu-based controls */ 6414 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 6415 msrs->procbased_ctls_low, 6416 msrs->procbased_ctls_high); 6417 msrs->procbased_ctls_low = 6418 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6419 msrs->procbased_ctls_high &= 6420 CPU_BASED_INTR_WINDOW_EXITING | 6421 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 6422 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 6423 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 6424 CPU_BASED_CR3_STORE_EXITING | 6425 #ifdef CONFIG_X86_64 6426 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 6427 #endif 6428 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 6429 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 6430 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 6431 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 6432 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 6433 /* 6434 * We can allow some features even when not supported by the 6435 * hardware. For example, L1 can specify an MSR bitmap - and we 6436 * can use it to avoid exits to L1 - even when L0 runs L2 6437 * without MSR bitmaps. 6438 */ 6439 msrs->procbased_ctls_high |= 6440 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6441 CPU_BASED_USE_MSR_BITMAPS; 6442 6443 /* We support free control of CR3 access interception. */ 6444 msrs->procbased_ctls_low &= 6445 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 6446 6447 /* 6448 * secondary cpu-based controls. Do not include those that 6449 * depend on CPUID bits, they are added later by 6450 * vmx_vcpu_after_set_cpuid. 6451 */ 6452 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) 6453 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 6454 msrs->secondary_ctls_low, 6455 msrs->secondary_ctls_high); 6456 6457 msrs->secondary_ctls_low = 0; 6458 msrs->secondary_ctls_high &= 6459 SECONDARY_EXEC_DESC | 6460 SECONDARY_EXEC_ENABLE_RDTSCP | 6461 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 6462 SECONDARY_EXEC_WBINVD_EXITING | 6463 SECONDARY_EXEC_APIC_REGISTER_VIRT | 6464 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 6465 SECONDARY_EXEC_RDRAND_EXITING | 6466 SECONDARY_EXEC_ENABLE_INVPCID | 6467 SECONDARY_EXEC_RDSEED_EXITING | 6468 SECONDARY_EXEC_XSAVES; 6469 6470 /* 6471 * We can emulate "VMCS shadowing," even if the hardware 6472 * doesn't support it. 
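* VMREAD/VMWRITE from L2 always exit to L0, which consults vmcs12's bitmaps and, if the access is not reflected to L1, emulates it against the shadow vmcs12.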
6473 */ 6474 msrs->secondary_ctls_high |= 6475 SECONDARY_EXEC_SHADOW_VMCS; 6476 6477 if (enable_ept) { 6478 /* nested EPT: emulate EPT also to L1 */ 6479 msrs->secondary_ctls_high |= 6480 SECONDARY_EXEC_ENABLE_EPT; 6481 msrs->ept_caps = 6482 VMX_EPT_PAGE_WALK_4_BIT | 6483 VMX_EPT_PAGE_WALK_5_BIT | 6484 VMX_EPTP_WB_BIT | 6485 VMX_EPT_INVEPT_BIT | 6486 VMX_EPT_EXECUTE_ONLY_BIT; 6487 6488 msrs->ept_caps &= ept_caps; 6489 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 6490 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 6491 VMX_EPT_1GB_PAGE_BIT; 6492 if (enable_ept_ad_bits) { 6493 msrs->secondary_ctls_high |= 6494 SECONDARY_EXEC_ENABLE_PML; 6495 msrs->ept_caps |= VMX_EPT_AD_BIT; 6496 } 6497 } 6498 6499 if (cpu_has_vmx_vmfunc()) { 6500 msrs->secondary_ctls_high |= 6501 SECONDARY_EXEC_ENABLE_VMFUNC; 6502 /* 6503 * Advertise EPTP switching unconditionally 6504 * since we emulate it 6505 */ 6506 if (enable_ept) 6507 msrs->vmfunc_controls = 6508 VMX_VMFUNC_EPTP_SWITCHING; 6509 } 6510 6511 /* 6512 * Old versions of KVM use the single-context version without 6513 * checking for support, so declare that it is supported even 6514 * though it is treated as global context. The alternative is 6515 * not failing the single-context invvpid, and it is worse. 6516 */ 6517 if (enable_vpid) { 6518 msrs->secondary_ctls_high |= 6519 SECONDARY_EXEC_ENABLE_VPID; 6520 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 6521 VMX_VPID_EXTENT_SUPPORTED_MASK; 6522 } 6523 6524 if (enable_unrestricted_guest) 6525 msrs->secondary_ctls_high |= 6526 SECONDARY_EXEC_UNRESTRICTED_GUEST; 6527 6528 if (flexpriority_enabled) 6529 msrs->secondary_ctls_high |= 6530 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6531 6532 if (enable_sgx) 6533 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 6534 6535 /* miscellaneous data */ 6536 rdmsr(MSR_IA32_VMX_MISC, 6537 msrs->misc_low, 6538 msrs->misc_high); 6539 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; 6540 msrs->misc_low |= 6541 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 6542 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 6543 VMX_MISC_ACTIVITY_HLT | 6544 VMX_MISC_ACTIVITY_WAIT_SIPI; 6545 msrs->misc_high = 0; 6546 6547 /* 6548 * This MSR reports some information about VMX support. We 6549 * should return information about the VMX we emulate for the 6550 * guest, and the VMCS structure we give it - not about the 6551 * VMX support of the underlying hardware. 6552 */ 6553 msrs->basic = 6554 VMCS12_REVISION | 6555 VMX_BASIC_TRUE_CTLS | 6556 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 6557 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 6558 6559 if (cpu_has_vmx_basic_inout()) 6560 msrs->basic |= VMX_BASIC_INOUT; 6561 6562 /* 6563 * These MSRs specify bits which the guest must keep fixed on 6564 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 6565 * We picked the standard core2 setting. 6566 */ 6567 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 6568 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 6569 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 6570 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 6571 6572 /* These MSRs specify bits which the guest must keep fixed off. 
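* (a bit that reads as 0 in CR0/CR4_FIXED1 must also be kept 0 while in VMX operation); the host's values are passed through as-is.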
*/ 6573 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 6574 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 6575 6576 /* highest index: VMX_PREEMPTION_TIMER_VALUE */ 6577 msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; 6578 } 6579 6580 void nested_vmx_hardware_unsetup(void) 6581 { 6582 int i; 6583 6584 if (enable_shadow_vmcs) { 6585 for (i = 0; i < VMX_BITMAP_NR; i++) 6586 free_page((unsigned long)vmx_bitmap[i]); 6587 } 6588 } 6589 6590 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 6591 { 6592 int i; 6593 6594 if (!cpu_has_vmx_shadow_vmcs()) 6595 enable_shadow_vmcs = 0; 6596 if (enable_shadow_vmcs) { 6597 for (i = 0; i < VMX_BITMAP_NR; i++) { 6598 /* 6599 * The vmx_bitmap is not tied to a VM and so should 6600 * not be charged to a memcg. 6601 */ 6602 vmx_bitmap[i] = (unsigned long *) 6603 __get_free_page(GFP_KERNEL); 6604 if (!vmx_bitmap[i]) { 6605 nested_vmx_hardware_unsetup(); 6606 return -ENOMEM; 6607 } 6608 } 6609 6610 init_vmcs_shadow_fields(); 6611 } 6612 6613 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 6614 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 6615 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 6616 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 6617 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 6618 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 6619 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 6620 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; 6621 exit_handlers[EXIT_REASON_VMON] = handle_vmon; 6622 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 6623 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 6624 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 6625 6626 return 0; 6627 } 6628 6629 struct kvm_x86_nested_ops vmx_nested_ops = { 6630 .check_events = vmx_check_nested_events, 6631 .hv_timer_pending = nested_vmx_preemption_timer_pending, 6632 .triple_fault = nested_vmx_triple_fault, 6633 .get_state = vmx_get_nested_state, 6634 .set_state = vmx_set_nested_state, 6635 .get_nested_state_pages = vmx_get_nested_state_pages, 6636 .write_log_dirty = nested_vmx_write_pml_buffer, 6637 .enable_evmcs = nested_enable_evmcs, 6638 .get_evmcs_version = nested_get_evmcs_version, 6639 }; 6640