// SPDX-License-Identifier: GPL-2.0

#include <linux/frame.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "trace.h"
#include "x86.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC(consistency_check)						\
({									\
	bool failed = (consistency_check);				\
	if (failed)							\
		trace_kvm_nested_vmenter_failed(#consistency_check, 0);	\
	failed;								\
})

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
		return nested_vmx_failInvalid(vcpu);

	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force a shadow sync because
	 * VM_INSTRUCTION_ERROR is not shadowed
	 */
	return kvm_skip_emulated_instruction(vcpu);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: not to reset guest simply here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, -1ull);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.hv_evmcs)
		return;

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
	vmx->nested.hv_evmcs_vmptr = 0;
	vmx->nested.hv_evmcs = NULL;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_clean(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

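/*
 * Propagate the host state cached in @prev (the previously loaded VMCS) to
 * the loaded_vmcs being switched to, but only if host state has already been
 * loaded for this vCPU.
 */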
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (vmx->loaded_vmcs == vmcs)
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vmx_register_cache_reset(vcpu);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
	free_nested(vcpu);
	vcpu_put(vcpu);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
		struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vm_exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else if (fault->error_code & PFERR_RSVD_MASK)
		vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
	else
		vm_exit_reason = EXIT_REASON_EPT_VIOLATION;

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	kvm_init_shadow_ept_mmu(vcpu,
			to_vmx(vcpu)->nested.msrs.ept_caps &
			VMX_EPT_EXECUTE_ONLY_BIT,
			nested_ept_ad_enabled(vcpu),
			nested_ept_get_eptp(vcpu));
	vcpu->arch.mmu->get_guest_pgd     = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}


/*
 * KVM wants to inject page-faults which it got to the guest. This function
 * checks whether in a nested guest, we need to inject them to L1 or L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (nr == PF_VECTOR) {
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
			return 1;
		}
	} else if (vmcs12->exception_bitmap & (1u << nr)) {
		if (nr == DB_VECTOR) {
			if (!has_payload) {
				payload = vcpu->arch.dr6;
				payload &= ~(DR6_FIXED_1 | DR6_BT);
				payload ^= DR6_RTM;
			}
			*exit_qual = payload;
		} else
			*exit_qual = 0;
		return 1;
	}

	return 0;
}


static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
		struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
		!to_vmx(vcpu)->nested.nested_run_pending) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * Check if MSR is intercepted for L01 MSR bitmap.
 */
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;

	if (msr <= 0x1fff) {
		return !!test_bit(msr, msr_bitmap + 0x800 / f);
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
	}

	return true;
}

/*
 * If a msr is allowed by L0, we should check whether it is allowed by L1.
 * The corresponding bit will be cleared unless both of L0 and L1 allow it.
 */
static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
						 unsigned long *msr_bitmap_nested,
						 u32 msr, int type)
{
	int f = sizeof(unsigned long);

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
			/* read-low */
			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
			/* write-low */
			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
			/* read-high */
			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
			/* write-high */
			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

	}
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively disable them below.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);

	/*
	 * Checking the L0->L1 bitmap is trying to verify two things:
	 *
	 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
	 *    ensures that we do not accidentally generate an L02 MSR bitmap
	 *    from the L12 MSR bitmap that is too permissive.
	 * 2. That L1 or L2s have actually used the MSR. This avoids
	 *    unnecessary merging of the bitmap if the MSR is unused. This
	 *    works properly because we only update the L01 MSR bitmap lazily.
	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
	 *    updated to reflect this when L1 (or its L2s) actually write to
	 *    the MSR.
	 */
	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_SPEC_CTRL,
					MSR_TYPE_R | MSR_TYPE_W);

	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_PRED_CMD,
					MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct kvm_host_map map;
	struct vmcs12 *shadow;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	shadow = get_shadow_vmcs12(vcpu);

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
		return;

	memcpy(shadow, map.hva, VMCS12_SIZE);
	kvm_vcpu_unmap(vcpu, &map, false);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	   (CC(!nested_cpu_has_vid(vmcs12)) ||
	    CC(!nested_exit_intr_ack_set(vcpu)) ||
	    CC((vmcs12->posted_intr_nv & 0xff00)) ||
	    CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
	    CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	int maxphyaddr;

	if (count == 0)
		return 0;
	maxphyaddr = cpuid_maxphyaddr(vcpu);
	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

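/*
 * Maximum number of MSRs that KVM honors in a single VM-entry/VM-exit MSR
 * load/store list, derived from the list size advertised in the emulated
 * IA32_VMX_MISC MSR.
 */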
static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
 * as possible, process all valid entries before failing rather than precheck
 * for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC) {
		int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
					       MSR_IA32_TSC);

		if (index >= 0) {
			u64 val = vmx->msr_autostore.guest.val[index].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
					     offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

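/*
 * Keep vmcs02's VM-exit MSR-store area (msr_autostore) in sync with vmcs12:
 * add @msr_index when L1 wants it stored on VM-exit and it isn't already in
 * the list, and drop it when L1 no longer does.
 */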
static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_index;
	bool in_autostore_list;
	int last;

	msr_autostore_index = vmx_find_msr_index(autostore, msr_index);
	in_autostore_list = msr_autostore_index >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here. Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
			 * instead of reading the value from the vmcs02 VMExit
			 * MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore.  Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_index] = autostore->val[last];
	}
}

static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
	unsigned long invalid_mask;

	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
	return (val & invalid_mask) == 0;
}

/*
 * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
 * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
 * enable VPID for L2 (implying it expects a TLB flush on VMX transitions).
 * Here's why.
 *
 * If EPT is enabled by L0 a sync is never needed:
 * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there
 *   cannot be unsync'd SPTEs for either L1 or L2.
 *
 * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter
 *   as VM-Enter isn't required to invalidate guest-physical mappings
 *   (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush
 *   stale guest-physical mappings for L2 from the TLB.  And as above, L0 isn't
 *   shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit.
 *
 * If EPT is disabled by L0:
 * - if VPID is enabled by L1 (for L2), the situation is similar to when L1
 *   enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't
 *   required to invalidate linear mappings (EPT is disabled so there are
 *   no combined or guest-physical mappings), i.e. L1 can't rely on the
 *   (virtual) CPU to flush stale linear mappings for either L2 or itself (L1).
 *
 * - however if VPID is disabled by L1, then a sync is needed as L1 expects all
 *   linear mappings (EPT is disabled so there are no combined or guest-physical
 *   mappings) to be invalidated on both VM-Enter and VM-Exit.
 *
 * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which
 * additionally checks that L2 has been assigned a VPID (when EPT is disabled).
 * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect
 * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2
 * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has
 * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1
 * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't
 * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush
 * stale TLB entries, at which point L0 will sync L2's MMU.
 */
static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
{
	return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu));
}

/*
 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(!nested_cr3_valid(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (!nested_ept && is_pae_paging(vcpu) &&
	    (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
		if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
			*entry_failure_code = ENTRY_FAIL_PDPTE;
			return -EINVAL;
		}
	}

	/*
	 * Unconditionally skip the TLB flush on fast CR3 switch, all TLB
	 * flushes are handled by nested_vmx_transition_tlb_flush().  See
	 * nested_vmx_transition_mmu_sync for details on skipping the MMU sync.
	 */
	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3, true,
				!nested_vmx_transition_mmu_sync(vcpu));

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	kvm_init_mmu(vcpu, false);

	return 0;
}

/*
 * Returns if KVM is able to config CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If VPID is disabled, linear and combined mappings are flushed on
	 * VM-Enter/VM-Exit, and guest-physical mappings are valid only for
	 * their associated EPTP.
	 */
	if (!enable_vpid)
		return;

	/*
	 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
	 * for *all* contexts to be flushed on VM-Enter/VM-Exit.
	 *
	 * If VPID is enabled and used by vmcs12, but L2 does not have a unique
	 * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
	 * a VPID for L2, flush the current context as the effective ASID is
	 * common to both L1 and L2.
	 *
	 * Defer the flush so that it runs after vmcs02.EPTP has been set by
	 * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
	 * redundant flushes further down the nested pipeline.
	 *
	 * If a TLB flush isn't required due to any of the above, and vpid12 is
	 * changing then the new "virtual" VPID (vpid12) will reuse the same
	 * "real" VPID (vpid02), and so needs to be sync'd.  There is no direct
	 * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
	 * all nested vCPUs.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
	} else if (!nested_has_guest_tlb_tag(vcpu)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	} else if (is_vmenter &&
		   vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		vpid_sync_context(nested_get_vpid02(vcpu));
	}
}

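/*
 * Return true if, within @mask, every bit set in @subset is also set in
 * @superset.
 */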
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmx->nested.msrs.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 supported;
	u32 *lowp, *highp;

	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		lowp = &vmx->nested.msrs.pinbased_ctls_low;
		highp = &vmx->nested.msrs.pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		lowp = &vmx->nested.msrs.procbased_ctls_low;
		highp = &vmx->nested.msrs.procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		lowp = &vmx->nested.msrs.exit_ctls_low;
		highp = &vmx->nested.msrs.exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		lowp = &vmx->nested.msrs.entry_ctls_low;
		highp = &vmx->nested.msrs.entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		lowp = &vmx->nested.msrs.secondary_ctls_low;
		highp = &vmx->nested.msrs.secondary_ctls_high;
		break;
	default:
		BUG();
	}

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	*lowp = data;
	*highp = data >> 32;
	return 0;
}

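/*
 * Userspace restore of IA32_VMX_MISC: the new value may only clear feature
 * bits relative to what KVM advertises, may not raise the CR3-target count
 * or MSR list size, and must preserve the MSEG revision ID and (if the
 * preemption timer is exposed) the timer rate.
 */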
static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc;

	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				   vmx->nested.msrs.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap;

	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
					   vmx->nested.msrs.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 *msr;

	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		msr = &vmx->nested.msrs.cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		msr = &vmx->nested.msrs.cr4_fixed0;
		break;
	default:
		BUG();
	}

	/*
	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*msr = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmx->nested.msrs.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

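/*
 * Copy all shadowed fields (both read/write and read-only) from the cached
 * vmcs12 into the shadow VMCS so that L1's VMREAD/VMWRITE accesses see
 * up-to-date values without a VM-exit.
 */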
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
	}

	/*
	 * Not used?
	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
	 * vmcs12->page_fault_error_code_mask =
	 *		evmcs->page_fault_error_code_mask;
	 * vmcs12->page_fault_error_code_match =
	 *		evmcs->page_fault_error_code_match;
	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
	 */

	/*
	 * Read only fields:
	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
	 * vmcs12->exit_qualification = evmcs->exit_qualification;
	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
	 *
	 * Not present in struct vmcs12:
	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return 0;
}

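/*
 * Counterpart of copy_enlightened_to_vmcs12(): write the vmcs12 fields that
 * sync_vmcs02_to_vmcs12() may have updated back into the enlightened VMCS
 * for L1 to consume.
 */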
static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/*
	 * Should not be changed by KVM:
	 *
	 * evmcs->host_es_selector = vmcs12->host_es_selector;
	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
	 * evmcs->host_cr0 = vmcs12->host_cr0;
	 * evmcs->host_cr3 = vmcs12->host_cr3;
	 * evmcs->host_cr4 = vmcs12->host_cr4;
	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
	 * evmcs->host_rip = vmcs12->host_rip;
	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
	 * evmcs->host_fs_base = vmcs12->host_fs_base;
	 * evmcs->host_gs_base = vmcs12->host_gs_base;
	 * evmcs->host_tr_base = vmcs12->host_tr_base;
	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
	 * evmcs->host_rsp = vmcs12->host_rsp;
	 * sync_vmcs02_to_vmcs12() doesn't read these:
	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
	 * evmcs->ept_pointer = vmcs12->ept_pointer;
	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1848 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1849 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1850 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1851 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1852 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1853 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1854 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1855 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1856 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1857 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1858 * evmcs->page_fault_error_code_mask = 1859 * vmcs12->page_fault_error_code_mask; 1860 * evmcs->page_fault_error_code_match = 1861 * vmcs12->page_fault_error_code_match; 1862 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1863 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1864 * evmcs->tsc_offset = vmcs12->tsc_offset; 1865 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1866 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1867 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1868 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1869 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1870 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1871 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1872 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1873 * 1874 * Not present in struct vmcs12: 1875 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1876 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1877 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1878 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1879 */ 1880 1881 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1882 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1883 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1884 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1885 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1886 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1887 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1888 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1889 1890 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1891 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1892 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1893 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1894 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1895 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1896 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1897 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1898 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1899 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1900 1901 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1902 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1903 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1904 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1905 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1906 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1907 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1908 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1909 1910 evmcs->guest_es_base = vmcs12->guest_es_base; 1911 evmcs->guest_cs_base = vmcs12->guest_cs_base; 
1912 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1913 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1914 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1915 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1916 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1917 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1918 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1919 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1920 1921 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1922 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1923 1924 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1925 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1926 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1927 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1928 1929 evmcs->guest_pending_dbg_exceptions = 1930 vmcs12->guest_pending_dbg_exceptions; 1931 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1932 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1933 1934 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1935 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1936 1937 evmcs->guest_cr0 = vmcs12->guest_cr0; 1938 evmcs->guest_cr3 = vmcs12->guest_cr3; 1939 evmcs->guest_cr4 = vmcs12->guest_cr4; 1940 evmcs->guest_dr7 = vmcs12->guest_dr7; 1941 1942 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1943 1944 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1945 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1946 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1947 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1948 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1949 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1950 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1951 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1952 1953 evmcs->exit_qualification = vmcs12->exit_qualification; 1954 1955 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1956 evmcs->guest_rsp = vmcs12->guest_rsp; 1957 evmcs->guest_rflags = vmcs12->guest_rflags; 1958 1959 evmcs->guest_interruptibility_info = 1960 vmcs12->guest_interruptibility_info; 1961 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1962 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1963 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1964 evmcs->vm_entry_exception_error_code = 1965 vmcs12->vm_entry_exception_error_code; 1966 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 1967 1968 evmcs->guest_rip = vmcs12->guest_rip; 1969 1970 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 1971 1972 return 0; 1973 } 1974 1975 /* 1976 * This is an equivalent of the nested hypervisor executing the vmptrld 1977 * instruction. 
1978 */
1979 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
1980 struct kvm_vcpu *vcpu, bool from_launch)
1981 {
1982 struct vcpu_vmx *vmx = to_vmx(vcpu);
1983 bool evmcs_gpa_changed = false;
1984 u64 evmcs_gpa;
1985 
1986 if (likely(!vmx->nested.enlightened_vmcs_enabled))
1987 return EVMPTRLD_DISABLED;
1988 
1989 if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
1990 return EVMPTRLD_DISABLED;
1991 
1992 if (unlikely(!vmx->nested.hv_evmcs ||
1993 evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1994 if (!vmx->nested.hv_evmcs)
1995 vmx->nested.current_vmptr = -1ull;
1996 
1997 nested_release_evmcs(vcpu);
1998 
1999 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
2000 &vmx->nested.hv_evmcs_map))
2001 return EVMPTRLD_ERROR;
2002 
2003 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
2004 
2005 /*
2006 * Currently, KVM only supports eVMCS version 1
2007 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set the
2008 * first u32 field of the eVMCS (its VersionNumber field) to this
2009 * value.
2010 *
2011 * The guest should learn which eVMCS versions the host supports by
2012 * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is
2013 * expected to set this CPUID leaf according to the value
2014 * returned in vmcs_version from nested_enable_evmcs().
2015 *
2016 * However, it turns out that Microsoft Hyper-V fails to comply
2017 * with its own invented interface: when Hyper-V uses eVMCS, it
2018 * sets the first u32 field of the eVMCS to the revision_id
2019 * specified in MSR_IA32_VMX_BASIC instead of the eVMCS version
2020 * number, which should be one of the supported versions specified
2021 * in CPUID.0x4000000A.EAX[0:15].
2022 *
2023 * To work around this Hyper-V bug, accept either a supported
2024 * eVMCS version or the VMCS12 revision_id as valid values for the
2025 * first u32 field of the eVMCS.
2026 */
2027 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2028 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2029 nested_release_evmcs(vcpu);
2030 return EVMPTRLD_VMFAIL;
2031 }
2032 
2033 vmx->nested.dirty_vmcs12 = true;
2034 vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
2035 
2036 evmcs_gpa_changed = true;
2037 /*
2038 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
2039 * reloaded from the guest's memory (read-only fields, fields not
2040 * present in struct hv_enlightened_vmcs, ...). Make sure there
2041 * are no leftovers.
2042 */
2043 if (from_launch) {
2044 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2045 memset(vmcs12, 0, sizeof(*vmcs12));
2046 vmcs12->hdr.revision_id = VMCS12_REVISION;
2047 }
2048 
2049 }
2050 
2051 /*
2052 * Clean fields data can't be used on VMLAUNCH or when we switch
2053 * between different L2 guests, as KVM keeps a single VMCS12 per L1.
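 *
 * As a rough illustration (wording ours, not a quote from the TLFS):
 * each clean bit tells KVM that L1 has not touched the corresponding
 * eVMCS field group since the last VM-exit, so
 * copy_enlightened_to_vmcs12() may skip that group, e.g. when
 * HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR is set the guest CR0/CR3/CR4/DR7
 * values already cached in vmcs12 are reused. Clearing all of the bits
 * below therefore just forces a full copy on the next sync.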
2054 */ 2055 if (from_launch || evmcs_gpa_changed) 2056 vmx->nested.hv_evmcs->hv_clean_fields &= 2057 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2058 2059 return EVMPTRLD_SUCCEEDED; 2060 } 2061 2062 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2063 { 2064 struct vcpu_vmx *vmx = to_vmx(vcpu); 2065 2066 if (vmx->nested.hv_evmcs) { 2067 copy_vmcs12_to_enlightened(vmx); 2068 /* All fields are clean */ 2069 vmx->nested.hv_evmcs->hv_clean_fields |= 2070 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2071 } else { 2072 copy_vmcs12_to_shadow(vmx); 2073 } 2074 2075 vmx->nested.need_vmcs12_to_shadow_sync = false; 2076 } 2077 2078 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2079 { 2080 struct vcpu_vmx *vmx = 2081 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2082 2083 vmx->nested.preemption_timer_expired = true; 2084 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2085 kvm_vcpu_kick(&vmx->vcpu); 2086 2087 return HRTIMER_NORESTART; 2088 } 2089 2090 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2091 { 2092 struct vcpu_vmx *vmx = to_vmx(vcpu); 2093 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2094 2095 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2096 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2097 2098 if (!vmx->nested.has_preemption_timer_deadline) { 2099 vmx->nested.preemption_timer_deadline = 2100 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2101 vmx->nested.has_preemption_timer_deadline = true; 2102 } 2103 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2104 } 2105 2106 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2107 u64 preemption_timeout) 2108 { 2109 struct vcpu_vmx *vmx = to_vmx(vcpu); 2110 2111 /* 2112 * A timer value of zero is architecturally guaranteed to cause 2113 * a VMExit prior to executing any instructions in the guest. 2114 */ 2115 if (preemption_timeout == 0) { 2116 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2117 return; 2118 } 2119 2120 if (vcpu->arch.virtual_tsc_khz == 0) 2121 return; 2122 2123 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2124 preemption_timeout *= 1000000; 2125 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2126 hrtimer_start(&vmx->nested.preemption_timer, 2127 ktime_add_ns(ktime_get(), preemption_timeout), 2128 HRTIMER_MODE_ABS_PINNED); 2129 } 2130 2131 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2132 { 2133 if (vmx->nested.nested_run_pending && 2134 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2135 return vmcs12->guest_ia32_efer; 2136 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2137 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2138 else 2139 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2140 } 2141 2142 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2143 { 2144 /* 2145 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2146 * according to L0's settings (vmcs12 is irrelevant here). Host 2147 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2148 * will be set as needed prior to VMLAUNCH/VMRESUME. 2149 */ 2150 if (vmx->nested.vmcs02_initialized) 2151 return; 2152 vmx->nested.vmcs02_initialized = true; 2153 2154 /* 2155 * We don't care what the EPTP value is we just need to guarantee 2156 * it's valid so we don't get a false positive when doing early 2157 * consistency checks. 
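 *
 * (Illustrative note: construct_eptp() produces a value of the form
 * root_hpa | memory-type | ((page-walk-length - 1) << 3), so even a
 * root of 0 yields an EPTP that passes the validity checks; the real
 * EPTP used to run L2 is written later.)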
2158 */ 2159 if (enable_ept && nested_early_check) 2160 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); 2161 2162 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2163 if (cpu_has_vmx_vmfunc()) 2164 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2165 2166 if (cpu_has_vmx_posted_intr()) 2167 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2168 2169 if (cpu_has_vmx_msr_bitmap()) 2170 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2171 2172 /* 2173 * The PML address never changes, so it is constant in vmcs02. 2174 * Conceptually we want to copy the PML index from vmcs01 here, 2175 * and then back to vmcs01 on nested vmexit. But since we flush 2176 * the log and reset GUEST_PML_INDEX on each vmexit, the PML 2177 * index is also effectively constant in vmcs02. 2178 */ 2179 if (enable_pml) { 2180 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 2181 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 2182 } 2183 2184 if (cpu_has_vmx_encls_vmexit()) 2185 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); 2186 2187 /* 2188 * Set the MSR load/store lists to match L0's settings. Only the 2189 * addresses are constant (for vmcs02), the counts can change based 2190 * on L2's behavior, e.g. switching to/from long mode. 2191 */ 2192 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2193 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2194 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2195 2196 vmx_set_constant_host_state(vmx); 2197 } 2198 2199 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2200 struct vmcs12 *vmcs12) 2201 { 2202 prepare_vmcs02_constant_state(vmx); 2203 2204 vmcs_write64(VMCS_LINK_POINTER, -1ull); 2205 2206 if (enable_vpid) { 2207 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2208 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2209 else 2210 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2211 } 2212 } 2213 2214 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2215 { 2216 u32 exec_control, vmcs12_exec_ctrl; 2217 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2218 2219 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) 2220 prepare_vmcs02_early_rare(vmx, vmcs12); 2221 2222 /* 2223 * PIN CONTROLS 2224 */ 2225 exec_control = vmx_pin_based_exec_ctrl(vmx); 2226 exec_control |= (vmcs12->pin_based_vm_exec_control & 2227 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2228 2229 /* Posted interrupts setting is only taken from vmcs12. */ 2230 if (nested_cpu_has_posted_intr(vmcs12)) { 2231 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2232 vmx->nested.pi_pending = false; 2233 } else { 2234 exec_control &= ~PIN_BASED_POSTED_INTR; 2235 } 2236 pin_controls_set(vmx, exec_control); 2237 2238 /* 2239 * EXEC CONTROLS 2240 */ 2241 exec_control = vmx_exec_control(vmx); /* L0's desires */ 2242 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2243 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2244 exec_control &= ~CPU_BASED_TPR_SHADOW; 2245 exec_control |= vmcs12->cpu_based_vm_exec_control; 2246 2247 vmx->nested.l1_tpr_threshold = -1; 2248 if (exec_control & CPU_BASED_TPR_SHADOW) 2249 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2250 #ifdef CONFIG_X86_64 2251 else 2252 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2253 CPU_BASED_CR8_STORE_EXITING; 2254 #endif 2255 2256 /* 2257 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2258 * for I/O port accesses. 
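 *
 * (E.g. an L2 "out %al, $0x80" always exits to L0 first; whether that
 * exit is then reflected to L1 is decided in software from vmcs12's
 * CPU_BASED_UNCOND_IO_EXITING / I/O bitmap settings, so the hardware
 * I/O bitmaps are never consulted for L2.)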
2259 */ 2260 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2261 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2262 2263 /* 2264 * This bit will be computed in nested_get_vmcs12_pages, because 2265 * we do not have access to L1's MSR bitmap yet. For now, keep 2266 * the same bit as before, hoping to avoid multiple VMWRITEs that 2267 * only set/clear this bit. 2268 */ 2269 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2270 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2271 2272 exec_controls_set(vmx, exec_control); 2273 2274 /* 2275 * SECONDARY EXEC CONTROLS 2276 */ 2277 if (cpu_has_secondary_exec_ctrls()) { 2278 exec_control = vmx->secondary_exec_control; 2279 2280 /* Take the following fields only from vmcs12 */ 2281 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2282 SECONDARY_EXEC_ENABLE_INVPCID | 2283 SECONDARY_EXEC_RDTSCP | 2284 SECONDARY_EXEC_XSAVES | 2285 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2286 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2287 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2288 SECONDARY_EXEC_ENABLE_VMFUNC); 2289 if (nested_cpu_has(vmcs12, 2290 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { 2291 vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & 2292 ~SECONDARY_EXEC_ENABLE_PML; 2293 exec_control |= vmcs12_exec_ctrl; 2294 } 2295 2296 /* VMCS shadowing for L2 is emulated for now */ 2297 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2298 2299 /* 2300 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2301 * will not have to rewrite the controls just for this bit. 2302 */ 2303 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2304 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2305 exec_control |= SECONDARY_EXEC_DESC; 2306 2307 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2308 vmcs_write16(GUEST_INTR_STATUS, 2309 vmcs12->guest_intr_status); 2310 2311 secondary_exec_controls_set(vmx, exec_control); 2312 } 2313 2314 /* 2315 * ENTRY CONTROLS 2316 * 2317 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2318 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2319 * on the related bits (if supported by the CPU) in the hope that 2320 * we can avoid VMWrites during vmx_set_efer(). 2321 */ 2322 exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & 2323 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; 2324 if (cpu_has_load_ia32_efer()) { 2325 if (guest_efer & EFER_LMA) 2326 exec_control |= VM_ENTRY_IA32E_MODE; 2327 if (guest_efer != host_efer) 2328 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2329 } 2330 vm_entry_controls_set(vmx, exec_control); 2331 2332 /* 2333 * EXIT CONTROLS 2334 * 2335 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2336 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2337 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
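 *
 * (Illustration, our summary: e.g. vmcs12's VM_EXIT_ACK_INTR_ON_EXIT is
 * not copied into vmcs02 here; it is honored in software when KVM
 * emulates the L2->L1 exit.)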
2338 */ 2339 exec_control = vmx_vmexit_ctrl(); 2340 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2341 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2342 vm_exit_controls_set(vmx, exec_control); 2343 2344 /* 2345 * Interrupt/Exception Fields 2346 */ 2347 if (vmx->nested.nested_run_pending) { 2348 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2349 vmcs12->vm_entry_intr_info_field); 2350 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2351 vmcs12->vm_entry_exception_error_code); 2352 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2353 vmcs12->vm_entry_instruction_len); 2354 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2355 vmcs12->guest_interruptibility_info); 2356 vmx->loaded_vmcs->nmi_known_unmasked = 2357 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2358 } else { 2359 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2360 } 2361 } 2362 2363 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2364 { 2365 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2366 2367 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2368 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2369 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2370 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2371 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2372 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2373 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2374 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2375 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2376 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2377 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2378 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2379 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2380 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2381 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2382 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2383 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2384 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2385 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2386 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2387 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2388 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2389 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2390 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2391 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2392 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2393 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2394 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2395 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2396 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2397 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2398 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2399 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2400 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2401 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2402 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2403 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2404 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2405 } 2406 2407 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2408 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2409 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2410 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 
2411 vmcs12->guest_pending_dbg_exceptions); 2412 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2413 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2414 2415 /* 2416 * L1 may access the L2's PDPTR, so save them to construct 2417 * vmcs12 2418 */ 2419 if (enable_ept) { 2420 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2421 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2422 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2423 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2424 } 2425 2426 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2427 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2428 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2429 } 2430 2431 if (nested_cpu_has_xsaves(vmcs12)) 2432 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2433 2434 /* 2435 * Whether page-faults are trapped is determined by a combination of 2436 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. 2437 * If enable_ept, L0 doesn't care about page faults and we should 2438 * set all of these to L1's desires. However, if !enable_ept, L0 does 2439 * care about (at least some) page faults, and because it is not easy 2440 * (if at all possible?) to merge L0 and L1's desires, we simply ask 2441 * to exit on each and every L2 page fault. This is done by setting 2442 * MASK=MATCH=0 and (see below) EB.PF=1. 2443 * Note that below we don't need special code to set EB.PF beyond the 2444 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2445 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2446 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2447 */ 2448 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 2449 enable_ept ? vmcs12->page_fault_error_code_mask : 0); 2450 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 2451 enable_ept ? vmcs12->page_fault_error_code_match : 0); 2452 2453 if (cpu_has_vmx_apicv()) { 2454 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2455 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2456 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2457 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2458 } 2459 2460 /* 2461 * Make sure the msr_autostore list is up to date before we set the 2462 * count in the vmcs02. 2463 */ 2464 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2465 2466 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2467 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2468 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2469 2470 set_cr4_guest_host_mask(vmx); 2471 } 2472 2473 /* 2474 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2475 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2476 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2477 * guest in a way that will both be appropriate to L1's requests, and our 2478 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2479 * function also has additional necessary side-effects, like setting various 2480 * vcpu->arch fields. 2481 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2482 * is assigned to entry_failure_code on failure. 
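 *
 * (The caller, nested_vmx_enter_non_root_mode(), turns a non-zero
 * return into a synthesized "VM-entry failure" exit: exit_reason is set
 * to EXIT_REASON_INVALID_STATE and entry_failure_code becomes the exit
 * qualification reported to L1.)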
2483 */ 2484 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2485 enum vm_entry_failure_code *entry_failure_code) 2486 { 2487 struct vcpu_vmx *vmx = to_vmx(vcpu); 2488 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2489 bool load_guest_pdptrs_vmcs12 = false; 2490 2491 if (vmx->nested.dirty_vmcs12 || hv_evmcs) { 2492 prepare_vmcs02_rare(vmx, vmcs12); 2493 vmx->nested.dirty_vmcs12 = false; 2494 2495 load_guest_pdptrs_vmcs12 = !hv_evmcs || 2496 !(hv_evmcs->hv_clean_fields & 2497 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2498 } 2499 2500 if (vmx->nested.nested_run_pending && 2501 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2502 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2503 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2504 } else { 2505 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2506 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 2507 } 2508 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2509 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2510 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); 2511 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2512 2513 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2514 * bitwise-or of what L1 wants to trap for L2, and what we want to 2515 * trap. Note that CR0.TS also needs updating - we do this later. 2516 */ 2517 update_exception_bitmap(vcpu); 2518 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2519 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2520 2521 if (vmx->nested.nested_run_pending && 2522 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2523 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2524 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2525 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2526 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2527 } 2528 2529 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2530 2531 if (kvm_has_tsc_control) 2532 decache_tsc_multiplier(vmx); 2533 2534 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2535 2536 if (nested_cpu_has_ept(vmcs12)) 2537 nested_ept_init_mmu_context(vcpu); 2538 2539 /* 2540 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those 2541 * bits which we consider mandatory enabled. 2542 * The CR0_READ_SHADOW is what L2 should have expected to read given 2543 * the specifications by L1; It's not enough to take 2544 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we 2545 * have more bits than L1 expected. 2546 */ 2547 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2548 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2549 2550 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2551 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2552 2553 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2554 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2555 vmx_set_efer(vcpu, vcpu->arch.efer); 2556 2557 /* 2558 * Guest state is invalid and unrestricted guest is disabled, 2559 * which means L1 attempted VMEntry to L2 with invalid state. 2560 * Fail the VMEntry. 2561 */ 2562 if (vmx->emulation_required) { 2563 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2564 return -EINVAL; 2565 } 2566 2567 /* Shadow page tables on either EPT or shadow page tables. */ 2568 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2569 entry_failure_code)) 2570 return -EINVAL; 2571 2572 /* 2573 * Immediately write vmcs02.GUEST_CR3. 
It will be propagated to vmcs12 2574 * on nested VM-Exit, which can occur without actually running L2 and 2575 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with 2576 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the 2577 * transition to HLT instead of running L2. 2578 */ 2579 if (enable_ept) 2580 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2581 2582 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2583 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2584 is_pae_paging(vcpu)) { 2585 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2586 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2587 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2588 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2589 } 2590 2591 if (!enable_ept) 2592 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2593 2594 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2595 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2596 vmcs12->guest_ia32_perf_global_ctrl))) 2597 return -EINVAL; 2598 2599 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2600 kvm_rip_write(vcpu, vmcs12->guest_rip); 2601 return 0; 2602 } 2603 2604 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2605 { 2606 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2607 nested_cpu_has_virtual_nmis(vmcs12))) 2608 return -EINVAL; 2609 2610 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2611 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2612 return -EINVAL; 2613 2614 return 0; 2615 } 2616 2617 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2618 { 2619 struct vcpu_vmx *vmx = to_vmx(vcpu); 2620 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2621 2622 /* Check for memory type validity */ 2623 switch (new_eptp & VMX_EPTP_MT_MASK) { 2624 case VMX_EPTP_MT_UC: 2625 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2626 return false; 2627 break; 2628 case VMX_EPTP_MT_WB: 2629 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2630 return false; 2631 break; 2632 default: 2633 return false; 2634 } 2635 2636 /* Page-walk levels validity. 
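 *
 * (EPTP bits 5:3 encode the page-walk length minus 1; e.g. a 4-level
 * walk is encoded as 3 (VMX_EPTP_PWL_4) and is only legal if L0
 * advertised VMX_EPT_PAGE_WALK_4_BIT to L1. Illustrative reminder, see
 * the SDM for the authoritative encoding.)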
*/ 2637 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2638 case VMX_EPTP_PWL_5: 2639 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2640 return false; 2641 break; 2642 case VMX_EPTP_PWL_4: 2643 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2644 return false; 2645 break; 2646 default: 2647 return false; 2648 } 2649 2650 /* Reserved bits should not be set */ 2651 if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f))) 2652 return false; 2653 2654 /* AD, if set, should be supported */ 2655 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2656 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2657 return false; 2658 } 2659 2660 return true; 2661 } 2662 2663 /* 2664 * Checks related to VM-Execution Control Fields 2665 */ 2666 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2667 struct vmcs12 *vmcs12) 2668 { 2669 struct vcpu_vmx *vmx = to_vmx(vcpu); 2670 2671 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2672 vmx->nested.msrs.pinbased_ctls_low, 2673 vmx->nested.msrs.pinbased_ctls_high)) || 2674 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2675 vmx->nested.msrs.procbased_ctls_low, 2676 vmx->nested.msrs.procbased_ctls_high))) 2677 return -EINVAL; 2678 2679 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2680 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2681 vmx->nested.msrs.secondary_ctls_low, 2682 vmx->nested.msrs.secondary_ctls_high))) 2683 return -EINVAL; 2684 2685 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2686 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2687 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2688 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2689 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2690 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2691 nested_vmx_check_nmi_controls(vmcs12) || 2692 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2693 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2694 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2695 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2696 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2697 return -EINVAL; 2698 2699 if (!nested_cpu_has_preemption_timer(vmcs12) && 2700 nested_cpu_has_save_preemption_timer(vmcs12)) 2701 return -EINVAL; 2702 2703 if (nested_cpu_has_ept(vmcs12) && 2704 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2705 return -EINVAL; 2706 2707 if (nested_cpu_has_vmfunc(vmcs12)) { 2708 if (CC(vmcs12->vm_function_control & 2709 ~vmx->nested.msrs.vmfunc_controls)) 2710 return -EINVAL; 2711 2712 if (nested_cpu_has_eptp_switching(vmcs12)) { 2713 if (CC(!nested_cpu_has_ept(vmcs12)) || 2714 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2715 return -EINVAL; 2716 } 2717 } 2718 2719 return 0; 2720 } 2721 2722 /* 2723 * Checks related to VM-Exit Control Fields 2724 */ 2725 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2726 struct vmcs12 *vmcs12) 2727 { 2728 struct vcpu_vmx *vmx = to_vmx(vcpu); 2729 2730 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2731 vmx->nested.msrs.exit_ctls_low, 2732 vmx->nested.msrs.exit_ctls_high)) || 2733 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2734 return -EINVAL; 2735 2736 return 0; 2737 } 2738 2739 /* 2740 * Checks related to VM-Entry Control Fields 2741 */ 2742 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2743 struct vmcs12 *vmcs12) 2744 { 2745 struct vcpu_vmx *vmx = 
to_vmx(vcpu); 2746 2747 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2748 vmx->nested.msrs.entry_ctls_low, 2749 vmx->nested.msrs.entry_ctls_high))) 2750 return -EINVAL; 2751 2752 /* 2753 * From the Intel SDM, volume 3: 2754 * Fields relevant to VM-entry event injection must be set properly. 2755 * These fields are the VM-entry interruption-information field, the 2756 * VM-entry exception error code, and the VM-entry instruction length. 2757 */ 2758 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2759 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2760 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2761 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2762 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2763 bool should_have_error_code; 2764 bool urg = nested_cpu_has2(vmcs12, 2765 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2766 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2767 2768 /* VM-entry interruption-info field: interruption type */ 2769 if (CC(intr_type == INTR_TYPE_RESERVED) || 2770 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2771 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2772 return -EINVAL; 2773 2774 /* VM-entry interruption-info field: vector */ 2775 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2776 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2777 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2778 return -EINVAL; 2779 2780 /* VM-entry interruption-info field: deliver error code */ 2781 should_have_error_code = 2782 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2783 x86_exception_has_error_code(vector); 2784 if (CC(has_error_code != should_have_error_code)) 2785 return -EINVAL; 2786 2787 /* VM-entry exception error code */ 2788 if (CC(has_error_code && 2789 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2790 return -EINVAL; 2791 2792 /* VM-entry interruption-info field: reserved bits */ 2793 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2794 return -EINVAL; 2795 2796 /* VM-entry instruction length */ 2797 switch (intr_type) { 2798 case INTR_TYPE_SOFT_EXCEPTION: 2799 case INTR_TYPE_SOFT_INTR: 2800 case INTR_TYPE_PRIV_SW_EXCEPTION: 2801 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2802 CC(vmcs12->vm_entry_instruction_len == 0 && 2803 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2804 return -EINVAL; 2805 } 2806 } 2807 2808 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2809 return -EINVAL; 2810 2811 return 0; 2812 } 2813 2814 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2815 struct vmcs12 *vmcs12) 2816 { 2817 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2818 nested_check_vm_exit_controls(vcpu, vmcs12) || 2819 nested_check_vm_entry_controls(vcpu, vmcs12)) 2820 return -EINVAL; 2821 2822 if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled) 2823 return nested_evmcs_check_controls(vmcs12); 2824 2825 return 0; 2826 } 2827 2828 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2829 struct vmcs12 *vmcs12) 2830 { 2831 bool ia32e; 2832 2833 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2834 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2835 CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3))) 2836 return -EINVAL; 2837 2838 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2839 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2840 return -EINVAL; 2841 2842 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2843 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 
2844 return -EINVAL; 2845 2846 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2847 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2848 vmcs12->host_ia32_perf_global_ctrl))) 2849 return -EINVAL; 2850 2851 #ifdef CONFIG_X86_64 2852 ia32e = !!(vcpu->arch.efer & EFER_LMA); 2853 #else 2854 ia32e = false; 2855 #endif 2856 2857 if (ia32e) { 2858 if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || 2859 CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2860 return -EINVAL; 2861 } else { 2862 if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || 2863 CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2864 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2865 CC((vmcs12->host_rip) >> 32)) 2866 return -EINVAL; 2867 } 2868 2869 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2870 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2871 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2872 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2873 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2874 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2875 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2876 CC(vmcs12->host_cs_selector == 0) || 2877 CC(vmcs12->host_tr_selector == 0) || 2878 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2879 return -EINVAL; 2880 2881 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2882 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2883 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2884 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2885 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2886 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2887 return -EINVAL; 2888 2889 /* 2890 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2891 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2892 * the values of the LMA and LME bits in the field must each be that of 2893 * the host address-space size VM-exit control. 
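 *
 * (Worked example, values ours: host_ia32_efer = 0xd01, i.e. SCE, LME,
 * LMA and NXE set, is consistent only when VM_EXIT_HOST_ADDR_SPACE_SIZE
 * is 1; with that control clear, both LMA and LME must be 0.)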
2894 */ 2895 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2896 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2897 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2898 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2899 return -EINVAL; 2900 } 2901 2902 return 0; 2903 } 2904 2905 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2906 struct vmcs12 *vmcs12) 2907 { 2908 int r = 0; 2909 struct vmcs12 *shadow; 2910 struct kvm_host_map map; 2911 2912 if (vmcs12->vmcs_link_pointer == -1ull) 2913 return 0; 2914 2915 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2916 return -EINVAL; 2917 2918 if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) 2919 return -EINVAL; 2920 2921 shadow = map.hva; 2922 2923 if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || 2924 CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2925 r = -EINVAL; 2926 2927 kvm_vcpu_unmap(vcpu, &map, false); 2928 return r; 2929 } 2930 2931 /* 2932 * Checks related to Guest Non-register State 2933 */ 2934 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2935 { 2936 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2937 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)) 2938 return -EINVAL; 2939 2940 return 0; 2941 } 2942 2943 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 2944 struct vmcs12 *vmcs12, 2945 enum vm_entry_failure_code *entry_failure_code) 2946 { 2947 bool ia32e; 2948 2949 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2950 2951 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 2952 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 2953 return -EINVAL; 2954 2955 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 2956 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 2957 return -EINVAL; 2958 2959 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 2960 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 2961 return -EINVAL; 2962 2963 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 2964 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 2965 return -EINVAL; 2966 } 2967 2968 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2969 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2970 vmcs12->guest_ia32_perf_global_ctrl))) 2971 return -EINVAL; 2972 2973 /* 2974 * If the load IA32_EFER VM-entry control is 1, the following checks 2975 * are performed on the field for the IA32_EFER MSR: 2976 * - Bits reserved in the IA32_EFER MSR must be 0. 2977 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 2978 * the IA-32e mode guest VM-exit control. It must also be identical 2979 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 2980 * CR0.PG) is 1. 
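 *
 * (Worked example, values ours: entering a 64-bit L2 with
 * VM_ENTRY_IA32E_MODE set and guest CR0.PG = 1 requires both EFER.LMA
 * and EFER.LME in guest_ia32_efer, so 0x500 passes while 0x400, i.e.
 * LMA without LME, fails the check below.)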
2981 */ 2982 if (to_vmx(vcpu)->nested.nested_run_pending && 2983 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 2984 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 2985 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 2986 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 2987 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 2988 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 2989 return -EINVAL; 2990 } 2991 2992 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 2993 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 2994 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 2995 return -EINVAL; 2996 2997 if (nested_check_guest_non_reg_state(vmcs12)) 2998 return -EINVAL; 2999 3000 return 0; 3001 } 3002 3003 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3004 { 3005 struct vcpu_vmx *vmx = to_vmx(vcpu); 3006 unsigned long cr3, cr4; 3007 bool vm_fail; 3008 3009 if (!nested_early_check) 3010 return 0; 3011 3012 if (vmx->msr_autoload.host.nr) 3013 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3014 if (vmx->msr_autoload.guest.nr) 3015 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3016 3017 preempt_disable(); 3018 3019 vmx_prepare_switch_to_guest(vcpu); 3020 3021 /* 3022 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3023 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3024 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3025 * there is no need to preserve other bits or save/restore the field. 3026 */ 3027 vmcs_writel(GUEST_RFLAGS, 0); 3028 3029 cr3 = __get_current_cr3_fast(); 3030 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3031 vmcs_writel(HOST_CR3, cr3); 3032 vmx->loaded_vmcs->host_state.cr3 = cr3; 3033 } 3034 3035 cr4 = cr4_read_shadow(); 3036 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3037 vmcs_writel(HOST_CR4, cr4); 3038 vmx->loaded_vmcs->host_state.cr4 = cr4; 3039 } 3040 3041 asm( 3042 "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ 3043 "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" 3044 "je 1f \n\t" 3045 __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" 3046 "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" 3047 "1: \n\t" 3048 "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ 3049 3050 /* Check if vmlaunch or vmresume is needed */ 3051 "cmpb $0, %c[launched](%[loaded_vmcs])\n\t" 3052 3053 /* 3054 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set 3055 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail 3056 * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the 3057 * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail. 
3058 */ 3059 "call vmx_vmenter\n\t" 3060 3061 CC_SET(be) 3062 : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) 3063 : [HOST_RSP]"r"((unsigned long)HOST_RSP), 3064 [loaded_vmcs]"r"(vmx->loaded_vmcs), 3065 [launched]"i"(offsetof(struct loaded_vmcs, launched)), 3066 [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), 3067 [wordsize]"i"(sizeof(ulong)) 3068 : "memory" 3069 ); 3070 3071 if (vmx->msr_autoload.host.nr) 3072 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3073 if (vmx->msr_autoload.guest.nr) 3074 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3075 3076 if (vm_fail) { 3077 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3078 3079 preempt_enable(); 3080 3081 trace_kvm_nested_vmenter_failed( 3082 "early hardware check VM-instruction error: ", error); 3083 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3084 return 1; 3085 } 3086 3087 /* 3088 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3089 */ 3090 if (hw_breakpoint_active()) 3091 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3092 local_irq_enable(); 3093 preempt_enable(); 3094 3095 /* 3096 * A non-failing VMEntry means we somehow entered guest mode with 3097 * an illegal RIP, and that's just the tip of the iceberg. There 3098 * is no telling what memory has been modified or what state has 3099 * been exposed to unknown code. Hitting this all but guarantees 3100 * a (very critical) hardware issue. 3101 */ 3102 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3103 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3104 3105 return 0; 3106 } 3107 3108 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3109 { 3110 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3111 struct vcpu_vmx *vmx = to_vmx(vcpu); 3112 struct kvm_host_map *map; 3113 struct page *page; 3114 u64 hpa; 3115 3116 /* 3117 * hv_evmcs may end up being not mapped after migration (when 3118 * L2 was running), map it here to make sure vmcs12 changes are 3119 * properly reflected. 3120 */ 3121 if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) { 3122 enum nested_evmptrld_status evmptrld_status = 3123 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3124 3125 if (evmptrld_status == EVMPTRLD_VMFAIL || 3126 evmptrld_status == EVMPTRLD_ERROR) { 3127 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3128 __func__); 3129 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3130 vcpu->run->internal.suberror = 3131 KVM_INTERNAL_ERROR_EMULATION; 3132 vcpu->run->internal.ndata = 0; 3133 return false; 3134 } 3135 } 3136 3137 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3138 /* 3139 * Translate L1 physical address to host physical 3140 * address for vmcs02. Keep the page pinned, so this 3141 * physical address remains valid. We keep a reference 3142 * to it so we can release it later. 
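 *
 * (The address written to APIC_ACCESS_ADDR below is a raw host
 * physical address, so the backing page must stay pinned for as long
 * as vmcs02 may reference it, not just for the duration of this
 * function.)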
3143 */ 3144 if (vmx->nested.apic_access_page) { /* shouldn't happen */ 3145 kvm_release_page_clean(vmx->nested.apic_access_page); 3146 vmx->nested.apic_access_page = NULL; 3147 } 3148 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); 3149 if (!is_error_page(page)) { 3150 vmx->nested.apic_access_page = page; 3151 hpa = page_to_phys(vmx->nested.apic_access_page); 3152 vmcs_write64(APIC_ACCESS_ADDR, hpa); 3153 } else { 3154 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", 3155 __func__); 3156 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3157 vcpu->run->internal.suberror = 3158 KVM_INTERNAL_ERROR_EMULATION; 3159 vcpu->run->internal.ndata = 0; 3160 return false; 3161 } 3162 } 3163 3164 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3165 map = &vmx->nested.virtual_apic_map; 3166 3167 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3168 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3169 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3170 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3171 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3172 /* 3173 * The processor will never use the TPR shadow, simply 3174 * clear the bit from the execution control. Such a 3175 * configuration is useless, but it happens in tests. 3176 * For any other configuration, failing the vm entry is 3177 * _not_ what the processor does but it's basically the 3178 * only possibility we have. 3179 */ 3180 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3181 } else { 3182 /* 3183 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3184 * force VM-Entry to fail. 3185 */ 3186 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); 3187 } 3188 } 3189 3190 if (nested_cpu_has_posted_intr(vmcs12)) { 3191 map = &vmx->nested.pi_desc_map; 3192 3193 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3194 vmx->nested.pi_desc = 3195 (struct pi_desc *)(((void *)map->hva) + 3196 offset_in_page(vmcs12->posted_intr_desc_addr)); 3197 vmcs_write64(POSTED_INTR_DESC_ADDR, 3198 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3199 } 3200 } 3201 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3202 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3203 else 3204 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3205 return true; 3206 } 3207 3208 /* 3209 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3210 * for running VMX instructions (except VMXON, whose prerequisites are 3211 * slightly different). It also specifies what exception to inject otherwise. 3212 * Note that many of these exceptions have priority over VM exits, so they 3213 * don't have to be checked again here. 3214 */ 3215 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3216 { 3217 if (!to_vmx(vcpu)->nested.vmxon) { 3218 kvm_queue_exception(vcpu, UD_VECTOR); 3219 return 0; 3220 } 3221 3222 if (vmx_get_cpl(vcpu)) { 3223 kvm_inject_gp(vcpu, 0); 3224 return 0; 3225 } 3226 3227 return 1; 3228 } 3229 3230 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3231 { 3232 u8 rvi = vmx_get_rvi(); 3233 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3234 3235 return ((rvi & 0xf0) > (vppr & 0xf0)); 3236 } 3237 3238 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3239 struct vmcs12 *vmcs12); 3240 3241 /* 3242 * If from_vmentry is false, this is being called from state restore (either RSM 3243 * or KVM_SET_NESTED_STATE). 
Otherwise it's called from vmlaunch/vmresume. 3244 * 3245 * Returns: 3246 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3247 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3248 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3249 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3250 */ 3251 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3252 bool from_vmentry) 3253 { 3254 struct vcpu_vmx *vmx = to_vmx(vcpu); 3255 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3256 enum vm_entry_failure_code entry_failure_code; 3257 bool evaluate_pending_interrupts; 3258 u32 exit_reason, failed_index; 3259 3260 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) 3261 kvm_vcpu_flush_tlb_current(vcpu); 3262 3263 evaluate_pending_interrupts = exec_controls_get(vmx) & 3264 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3265 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3266 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3267 3268 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3269 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3270 if (kvm_mpx_supported() && 3271 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 3272 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3273 3274 /* 3275 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3276 * nested early checks are disabled. In the event of a "late" VM-Fail, 3277 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3278 * software model to the pre-VMEntry host state. When EPT is disabled, 3279 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3280 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3281 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3282 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3283 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3284 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3285 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3286 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3287 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3288 * path would need to manually save/restore vmcs01.GUEST_CR3. 
3289 */ 3290 if (!enable_ept && !nested_early_check) 3291 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3292 3293 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3294 3295 prepare_vmcs02_early(vmx, vmcs12); 3296 3297 if (from_vmentry) { 3298 if (unlikely(!nested_get_vmcs12_pages(vcpu))) 3299 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3300 3301 if (nested_vmx_check_vmentry_hw(vcpu)) { 3302 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3303 return NVMX_VMENTRY_VMFAIL; 3304 } 3305 3306 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3307 &entry_failure_code)) { 3308 exit_reason = EXIT_REASON_INVALID_STATE; 3309 vmcs12->exit_qualification = entry_failure_code; 3310 goto vmentry_fail_vmexit; 3311 } 3312 } 3313 3314 enter_guest_mode(vcpu); 3315 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3316 vcpu->arch.tsc_offset += vmcs12->tsc_offset; 3317 3318 if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) { 3319 exit_reason = EXIT_REASON_INVALID_STATE; 3320 vmcs12->exit_qualification = entry_failure_code; 3321 goto vmentry_fail_vmexit_guest_mode; 3322 } 3323 3324 if (from_vmentry) { 3325 failed_index = nested_vmx_load_msr(vcpu, 3326 vmcs12->vm_entry_msr_load_addr, 3327 vmcs12->vm_entry_msr_load_count); 3328 if (failed_index) { 3329 exit_reason = EXIT_REASON_MSR_LOAD_FAIL; 3330 vmcs12->exit_qualification = failed_index; 3331 goto vmentry_fail_vmexit_guest_mode; 3332 } 3333 } else { 3334 /* 3335 * The MMU is not initialized to point at the right entities yet and 3336 * "get pages" would need to read data from the guest (i.e. we will 3337 * need to perform gpa to hpa translation). Request a call 3338 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3339 * have already been set at vmentry time and should not be reset. 3340 */ 3341 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); 3342 } 3343 3344 /* 3345 * If L1 had a pending IRQ/NMI until it executed 3346 * VMLAUNCH/VMRESUME which wasn't delivered because it was 3347 * disallowed (e.g. interrupts disabled), L0 needs to 3348 * evaluate if this pending event should cause an exit from L2 3349 * to L1 or delivered directly to L2 (e.g. In case L1 don't 3350 * intercept EXTERNAL_INTERRUPT). 3351 * 3352 * Usually this would be handled by the processor noticing an 3353 * IRQ/NMI window request, or checking RVI during evaluation of 3354 * pending virtual interrupts. However, this setting was done 3355 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 3356 * to perform pending event evaluation by requesting a KVM_REQ_EVENT. 3357 */ 3358 if (unlikely(evaluate_pending_interrupts)) 3359 kvm_make_request(KVM_REQ_EVENT, vcpu); 3360 3361 /* 3362 * Do not start the preemption timer hrtimer until after we know 3363 * we are successful, so that only nested_vmx_vmexit needs to cancel 3364 * the timer. 3365 */ 3366 vmx->nested.preemption_timer_expired = false; 3367 if (nested_cpu_has_preemption_timer(vmcs12)) { 3368 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3369 vmx_start_preemption_timer(vcpu, timer_value); 3370 } 3371 3372 /* 3373 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3374 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3375 * returned as far as L1 is concerned. It will only return (and set 3376 * the success flag) when L2 exits (see nested_vmx_vmexit()). 
3377 */ 3378 return NVMX_VMENTRY_SUCCESS; 3379 3380 /* 3381 * A failed consistency check that leads to a VMExit during L1's 3382 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3383 * 26.7 "VM-entry failures during or after loading guest state". 3384 */ 3385 vmentry_fail_vmexit_guest_mode: 3386 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3387 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3388 leave_guest_mode(vcpu); 3389 3390 vmentry_fail_vmexit: 3391 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3392 3393 if (!from_vmentry) 3394 return NVMX_VMENTRY_VMEXIT; 3395 3396 load_vmcs12_host_state(vcpu, vmcs12); 3397 vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 3398 if (enable_shadow_vmcs || vmx->nested.hv_evmcs) 3399 vmx->nested.need_vmcs12_to_shadow_sync = true; 3400 return NVMX_VMENTRY_VMEXIT; 3401 } 3402 3403 /* 3404 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3405 * for running an L2 nested guest. 3406 */ 3407 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3408 { 3409 struct vmcs12 *vmcs12; 3410 enum nvmx_vmentry_status status; 3411 struct vcpu_vmx *vmx = to_vmx(vcpu); 3412 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3413 enum nested_evmptrld_status evmptrld_status; 3414 3415 if (!nested_vmx_check_permission(vcpu)) 3416 return 1; 3417 3418 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3419 if (evmptrld_status == EVMPTRLD_ERROR) { 3420 kvm_queue_exception(vcpu, UD_VECTOR); 3421 return 1; 3422 } else if (evmptrld_status == EVMPTRLD_VMFAIL) { 3423 return nested_vmx_failInvalid(vcpu); 3424 } 3425 3426 if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) 3427 return nested_vmx_failInvalid(vcpu); 3428 3429 vmcs12 = get_vmcs12(vcpu); 3430 3431 /* 3432 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3433 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3434 * rather than RFLAGS.ZF, and no error number is stored to the 3435 * VM-instruction error field. 3436 */ 3437 if (vmcs12->hdr.shadow_vmcs) 3438 return nested_vmx_failInvalid(vcpu); 3439 3440 if (vmx->nested.hv_evmcs) { 3441 copy_enlightened_to_vmcs12(vmx); 3442 /* Enlightened VMCS doesn't have launch state */ 3443 vmcs12->launch_state = !launch; 3444 } else if (enable_shadow_vmcs) { 3445 copy_shadow_to_vmcs12(vmx); 3446 } 3447 3448 /* 3449 * The nested entry process starts with enforcing various prerequisites 3450 * on vmcs12 as required by the Intel SDM, and act appropriately when 3451 * they fail: As the SDM explains, some conditions should cause the 3452 * instruction to fail, while others will cause the instruction to seem 3453 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3454 * To speed up the normal (success) code path, we should avoid checking 3455 * for misconfigurations which will anyway be caught by the processor 3456 * when using the merged vmcs02. 3457 */ 3458 if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) 3459 return nested_vmx_failValid(vcpu, 3460 VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3461 3462 if (vmcs12->launch_state == launch) 3463 return nested_vmx_failValid(vcpu, 3464 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3465 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3466 3467 if (nested_vmx_check_controls(vcpu, vmcs12)) 3468 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3469 3470 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3471 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3472 3473 /* 3474 * We're finally done with prerequisite checking, and can start with 3475 * the nested entry. 3476 */ 3477 vmx->nested.nested_run_pending = 1; 3478 vmx->nested.has_preemption_timer_deadline = false; 3479 status = nested_vmx_enter_non_root_mode(vcpu, true); 3480 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3481 goto vmentry_failed; 3482 3483 /* Hide L1D cache contents from the nested guest. */ 3484 vmx->vcpu.arch.l1tf_flush_l1d = true; 3485 3486 /* 3487 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3488 * also be used as part of restoring nVMX state for 3489 * snapshot restore (migration). 3490 * 3491 * In this flow, it is assumed that vmcs12 cache was 3492 * trasferred as part of captured nVMX state and should 3493 * therefore not be read from guest memory (which may not 3494 * exist on destination host yet). 3495 */ 3496 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3497 3498 /* 3499 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3500 * awakened by event injection or by an NMI-window VM-exit or 3501 * by an interrupt-window VM-exit, halt the vcpu. 3502 */ 3503 if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && 3504 !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3505 !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) && 3506 !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) && 3507 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3508 vmx->nested.nested_run_pending = 0; 3509 return kvm_vcpu_halt(vcpu); 3510 } 3511 return 1; 3512 3513 vmentry_failed: 3514 vmx->nested.nested_run_pending = 0; 3515 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3516 return 0; 3517 if (status == NVMX_VMENTRY_VMEXIT) 3518 return 1; 3519 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3520 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3521 } 3522 3523 /* 3524 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3525 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3526 * This function returns the new value we should put in vmcs12.guest_cr0. 3527 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3528 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3529 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3530 * didn't trap the bit, because if L1 did, so would L0). 3531 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3532 * been modified by L2, and L1 knows it. So just leave the old value of 3533 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3534 * isn't relevant, because if L0 traps this bit it can set it to anything. 3535 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3536 * changed these bits, and therefore they need to be updated, but L0 3537 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3538 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 
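 * A purely illustrative example (the real masks are set up elsewhere):
 * if vmcs12.cr0_guest_host_mask covers only CR0.TS while L0 additionally
 * intercepts CR0.MP, then TS is taken from the old vmcs12.guest_cr0
 * (case 2), MP from vmcs02's CR0_READ_SHADOW (case 3), and any bit that
 * neither level traps comes straight from vmcs02 GUEST_CR0 (case 1).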
3539 */ 3540 static inline unsigned long 3541 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3542 { 3543 return 3544 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3545 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3546 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3547 vcpu->arch.cr0_guest_owned_bits)); 3548 } 3549 3550 static inline unsigned long 3551 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3552 { 3553 return 3554 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3555 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3556 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3557 vcpu->arch.cr4_guest_owned_bits)); 3558 } 3559 3560 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3561 struct vmcs12 *vmcs12) 3562 { 3563 u32 idt_vectoring; 3564 unsigned int nr; 3565 3566 if (vcpu->arch.exception.injected) { 3567 nr = vcpu->arch.exception.nr; 3568 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3569 3570 if (kvm_exception_is_soft(nr)) { 3571 vmcs12->vm_exit_instruction_len = 3572 vcpu->arch.event_exit_inst_len; 3573 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3574 } else 3575 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3576 3577 if (vcpu->arch.exception.has_error_code) { 3578 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3579 vmcs12->idt_vectoring_error_code = 3580 vcpu->arch.exception.error_code; 3581 } 3582 3583 vmcs12->idt_vectoring_info_field = idt_vectoring; 3584 } else if (vcpu->arch.nmi_injected) { 3585 vmcs12->idt_vectoring_info_field = 3586 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3587 } else if (vcpu->arch.interrupt.injected) { 3588 nr = vcpu->arch.interrupt.nr; 3589 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3590 3591 if (vcpu->arch.interrupt.soft) { 3592 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3593 vmcs12->vm_entry_instruction_len = 3594 vcpu->arch.event_exit_inst_len; 3595 } else 3596 idt_vectoring |= INTR_TYPE_EXT_INTR; 3597 3598 vmcs12->idt_vectoring_info_field = idt_vectoring; 3599 } 3600 } 3601 3602 3603 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3604 { 3605 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3606 gfn_t gfn; 3607 3608 /* 3609 * Don't need to mark the APIC access page dirty; it is never 3610 * written to by the CPU during APIC virtualization. 
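 * The virtual-APIC page and the posted-interrupt descriptor, on the
 * other hand, are written by the CPU (TPR virtualization and posted
 * interrupt processing respectively), so those are the pages marked
 * dirty below.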
3611 */ 3612 3613 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3614 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3615 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3616 } 3617 3618 if (nested_cpu_has_posted_intr(vmcs12)) { 3619 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3620 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3621 } 3622 } 3623 3624 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3625 { 3626 struct vcpu_vmx *vmx = to_vmx(vcpu); 3627 int max_irr; 3628 void *vapic_page; 3629 u16 status; 3630 3631 if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) 3632 return; 3633 3634 vmx->nested.pi_pending = false; 3635 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3636 return; 3637 3638 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3639 if (max_irr != 256) { 3640 vapic_page = vmx->nested.virtual_apic_map.hva; 3641 if (!vapic_page) 3642 return; 3643 3644 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3645 vapic_page, &max_irr); 3646 status = vmcs_read16(GUEST_INTR_STATUS); 3647 if ((u8)max_irr > ((u8)status & 0xff)) { 3648 status &= ~0xff; 3649 status |= (u8)max_irr; 3650 vmcs_write16(GUEST_INTR_STATUS, status); 3651 } 3652 } 3653 3654 nested_mark_vmcs12_pages_dirty(vcpu); 3655 } 3656 3657 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3658 unsigned long exit_qual) 3659 { 3660 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3661 unsigned int nr = vcpu->arch.exception.nr; 3662 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3663 3664 if (vcpu->arch.exception.has_error_code) { 3665 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3666 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3667 } 3668 3669 if (kvm_exception_is_soft(nr)) 3670 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3671 else 3672 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3673 3674 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3675 vmx_get_nmi_mask(vcpu)) 3676 intr_info |= INTR_INFO_UNBLOCK_NMI; 3677 3678 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3679 } 3680 3681 /* 3682 * Returns true if a debug trap is pending delivery. 3683 * 3684 * In KVM, debug traps bear an exception payload. As such, the class of a #DB 3685 * exception may be inferred from the presence of an exception payload. 3686 */ 3687 static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu) 3688 { 3689 return vcpu->arch.exception.pending && 3690 vcpu->arch.exception.nr == DB_VECTOR && 3691 vcpu->arch.exception.payload; 3692 } 3693 3694 /* 3695 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 3696 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 3697 * represents these debug traps with a payload that is said to be compatible 3698 * with the 'pending debug exceptions' field, write the payload to the VMCS 3699 * field if a VM-exit is delivered before the debug trap. 
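 * (The payload uses a DR6-like layout, B0-B3 in bits 3:0 and BS in
 * bit 14, which roughly matches the architectural encoding of the
 * 'pending debug exceptions' field, hence the direct copy below.)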
3700 */ 3701 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 3702 { 3703 if (vmx_pending_dbg_trap(vcpu)) 3704 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 3705 vcpu->arch.exception.payload); 3706 } 3707 3708 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 3709 { 3710 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3711 to_vmx(vcpu)->nested.preemption_timer_expired; 3712 } 3713 3714 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 3715 { 3716 struct vcpu_vmx *vmx = to_vmx(vcpu); 3717 unsigned long exit_qual; 3718 bool block_nested_events = 3719 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3720 bool mtf_pending = vmx->nested.mtf_pending; 3721 struct kvm_lapic *apic = vcpu->arch.apic; 3722 3723 /* 3724 * Clear the MTF state. If a higher priority VM-exit is delivered first, 3725 * this state is discarded. 3726 */ 3727 if (!block_nested_events) 3728 vmx->nested.mtf_pending = false; 3729 3730 if (lapic_in_kernel(vcpu) && 3731 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3732 if (block_nested_events) 3733 return -EBUSY; 3734 nested_vmx_update_pending_dbg(vcpu); 3735 clear_bit(KVM_APIC_INIT, &apic->pending_events); 3736 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3737 return 0; 3738 } 3739 3740 /* 3741 * Process any exceptions that are not debug traps before MTF. 3742 */ 3743 if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) { 3744 if (block_nested_events) 3745 return -EBUSY; 3746 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3747 goto no_vmexit; 3748 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3749 return 0; 3750 } 3751 3752 if (mtf_pending) { 3753 if (block_nested_events) 3754 return -EBUSY; 3755 nested_vmx_update_pending_dbg(vcpu); 3756 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 3757 return 0; 3758 } 3759 3760 if (vcpu->arch.exception.pending) { 3761 if (block_nested_events) 3762 return -EBUSY; 3763 if (!nested_vmx_check_exception(vcpu, &exit_qual)) 3764 goto no_vmexit; 3765 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3766 return 0; 3767 } 3768 3769 if (nested_vmx_preemption_timer_pending(vcpu)) { 3770 if (block_nested_events) 3771 return -EBUSY; 3772 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3773 return 0; 3774 } 3775 3776 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 3777 if (block_nested_events) 3778 return -EBUSY; 3779 goto no_vmexit; 3780 } 3781 3782 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 3783 if (block_nested_events) 3784 return -EBUSY; 3785 if (!nested_exit_on_nmi(vcpu)) 3786 goto no_vmexit; 3787 3788 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3789 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3790 INTR_INFO_VALID_MASK, 0); 3791 /* 3792 * The NMI-triggered VM exit counts as injection: 3793 * clear this one and block further NMIs. 
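 * (nested_vmx_vmexit() has already switched back to vmcs01 at this
 * point, so the NMI mask set below lands in L1's state.)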
3794 */ 3795 vcpu->arch.nmi_pending = 0; 3796 vmx_set_nmi_mask(vcpu, true); 3797 return 0; 3798 } 3799 3800 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 3801 if (block_nested_events) 3802 return -EBUSY; 3803 if (!nested_exit_on_intr(vcpu)) 3804 goto no_vmexit; 3805 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3806 return 0; 3807 } 3808 3809 no_vmexit: 3810 vmx_complete_nested_posted_interrupt(vcpu); 3811 return 0; 3812 } 3813 3814 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3815 { 3816 ktime_t remaining = 3817 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 3818 u64 value; 3819 3820 if (ktime_to_ns(remaining) <= 0) 3821 return 0; 3822 3823 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 3824 do_div(value, 1000000); 3825 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 3826 } 3827 3828 static bool is_vmcs12_ext_field(unsigned long field) 3829 { 3830 switch (field) { 3831 case GUEST_ES_SELECTOR: 3832 case GUEST_CS_SELECTOR: 3833 case GUEST_SS_SELECTOR: 3834 case GUEST_DS_SELECTOR: 3835 case GUEST_FS_SELECTOR: 3836 case GUEST_GS_SELECTOR: 3837 case GUEST_LDTR_SELECTOR: 3838 case GUEST_TR_SELECTOR: 3839 case GUEST_ES_LIMIT: 3840 case GUEST_CS_LIMIT: 3841 case GUEST_SS_LIMIT: 3842 case GUEST_DS_LIMIT: 3843 case GUEST_FS_LIMIT: 3844 case GUEST_GS_LIMIT: 3845 case GUEST_LDTR_LIMIT: 3846 case GUEST_TR_LIMIT: 3847 case GUEST_GDTR_LIMIT: 3848 case GUEST_IDTR_LIMIT: 3849 case GUEST_ES_AR_BYTES: 3850 case GUEST_DS_AR_BYTES: 3851 case GUEST_FS_AR_BYTES: 3852 case GUEST_GS_AR_BYTES: 3853 case GUEST_LDTR_AR_BYTES: 3854 case GUEST_TR_AR_BYTES: 3855 case GUEST_ES_BASE: 3856 case GUEST_CS_BASE: 3857 case GUEST_SS_BASE: 3858 case GUEST_DS_BASE: 3859 case GUEST_FS_BASE: 3860 case GUEST_GS_BASE: 3861 case GUEST_LDTR_BASE: 3862 case GUEST_TR_BASE: 3863 case GUEST_GDTR_BASE: 3864 case GUEST_IDTR_BASE: 3865 case GUEST_PENDING_DBG_EXCEPTIONS: 3866 case GUEST_BNDCFGS: 3867 return true; 3868 default: 3869 break; 3870 } 3871 3872 return false; 3873 } 3874 3875 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3876 struct vmcs12 *vmcs12) 3877 { 3878 struct vcpu_vmx *vmx = to_vmx(vcpu); 3879 3880 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 3881 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 3882 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 3883 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 3884 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 3885 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 3886 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 3887 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 3888 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 3889 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 3890 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 3891 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 3892 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 3893 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 3894 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 3895 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 3896 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 3897 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 3898 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 3899 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 3900 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 3901 
vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 3902 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 3903 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 3904 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 3905 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 3906 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 3907 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 3908 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 3909 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 3910 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 3911 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 3912 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 3913 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 3914 vmcs12->guest_pending_dbg_exceptions = 3915 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 3916 if (kvm_mpx_supported()) 3917 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3918 3919 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 3920 } 3921 3922 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3923 struct vmcs12 *vmcs12) 3924 { 3925 struct vcpu_vmx *vmx = to_vmx(vcpu); 3926 int cpu; 3927 3928 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 3929 return; 3930 3931 3932 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 3933 3934 cpu = get_cpu(); 3935 vmx->loaded_vmcs = &vmx->nested.vmcs02; 3936 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 3937 3938 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 3939 3940 vmx->loaded_vmcs = &vmx->vmcs01; 3941 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 3942 put_cpu(); 3943 } 3944 3945 /* 3946 * Update the guest state fields of vmcs12 to reflect changes that 3947 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 3948 * VM-entry controls is also updated, since this is really a guest 3949 * state bit.) 3950 */ 3951 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3952 { 3953 struct vcpu_vmx *vmx = to_vmx(vcpu); 3954 3955 if (vmx->nested.hv_evmcs) 3956 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 3957 3958 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs; 3959 3960 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 3961 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 3962 3963 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 3964 vmcs12->guest_rip = kvm_rip_read(vcpu); 3965 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 3966 3967 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 3968 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 3969 3970 vmcs12->guest_interruptibility_info = 3971 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 3972 3973 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3974 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 3975 else 3976 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 3977 3978 if (nested_cpu_has_preemption_timer(vmcs12) && 3979 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 3980 !vmx->nested.nested_run_pending) 3981 vmcs12->vmx_preemption_timer_value = 3982 vmx_get_preemption_timer_value(vcpu); 3983 3984 /* 3985 * In some cases (usually, nested EPT), L2 is allowed to change its 3986 * own CR3 without exiting. If it has changed it, we must keep it. 3987 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 3988 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 3989 * 3990 * Additionally, restore L2's PDPTR to vmcs12. 
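 * (The PDPTRs only matter if L2 was running with PAE paging under EPT,
 * which is exactly what the nested_cpu_has_ept() && is_pae_paging()
 * check below tests for.)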
3991 */ 3992 if (enable_ept) { 3993 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 3994 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3995 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 3996 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 3997 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 3998 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 3999 } 4000 } 4001 4002 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4003 4004 if (nested_cpu_has_vid(vmcs12)) 4005 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4006 4007 vmcs12->vm_entry_controls = 4008 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4009 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4010 4011 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4012 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 4013 4014 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4015 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4016 } 4017 4018 /* 4019 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4020 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4021 * and this function updates it to reflect the changes to the guest state while 4022 * L2 was running (and perhaps made some exits which were handled directly by L0 4023 * without going back to L1), and to reflect the exit reason. 4024 * Note that we do not have to copy here all VMCS fields, just those that 4025 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4026 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4027 * which already writes to vmcs12 directly. 4028 */ 4029 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4030 u32 vm_exit_reason, u32 exit_intr_info, 4031 unsigned long exit_qualification) 4032 { 4033 /* update exit information fields: */ 4034 vmcs12->vm_exit_reason = vm_exit_reason; 4035 vmcs12->exit_qualification = exit_qualification; 4036 vmcs12->vm_exit_intr_info = exit_intr_info; 4037 4038 vmcs12->idt_vectoring_info_field = 0; 4039 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4040 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4041 4042 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4043 vmcs12->launch_state = 1; 4044 4045 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4046 * instead of reading the real value. */ 4047 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4048 4049 /* 4050 * Transfer the event that L0 or L1 may wanted to inject into 4051 * L2 to IDT_VECTORING_INFO_FIELD. 4052 */ 4053 vmcs12_save_pending_event(vcpu, vmcs12); 4054 4055 /* 4056 * According to spec, there's no need to store the guest's 4057 * MSRs if the exit is due to a VM-entry failure that occurs 4058 * during or after loading the guest state. Since this exit 4059 * does not fall in that category, we need to save the MSRs. 4060 */ 4061 if (nested_vmx_store_msr(vcpu, 4062 vmcs12->vm_exit_msr_store_addr, 4063 vmcs12->vm_exit_msr_store_count)) 4064 nested_vmx_abort(vcpu, 4065 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4066 } 4067 4068 /* 4069 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 4070 * preserved above and would only end up incorrectly in L1. 
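 * ("Preserved above" refers to vmcs12_save_pending_event(), which has
 * already transferred any such event into
 * vmcs12->idt_vectoring_info_field for L1 to re-inject.)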
4071 */ 4072 vcpu->arch.nmi_injected = false; 4073 kvm_clear_exception_queue(vcpu); 4074 kvm_clear_interrupt_queue(vcpu); 4075 } 4076 4077 /* 4078 * A part of what we need to when the nested L2 guest exits and we want to 4079 * run its L1 parent, is to reset L1's guest state to the host state specified 4080 * in vmcs12. 4081 * This function is to be called not only on normal nested exit, but also on 4082 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4083 * Failures During or After Loading Guest State"). 4084 * This function should be called when the active VMCS is L1's (vmcs01). 4085 */ 4086 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4087 struct vmcs12 *vmcs12) 4088 { 4089 enum vm_entry_failure_code ignored; 4090 struct kvm_segment seg; 4091 4092 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4093 vcpu->arch.efer = vmcs12->host_ia32_efer; 4094 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4095 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4096 else 4097 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4098 vmx_set_efer(vcpu, vcpu->arch.efer); 4099 4100 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4101 kvm_rip_write(vcpu, vmcs12->host_rip); 4102 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4103 vmx_set_interrupt_shadow(vcpu, 0); 4104 4105 /* 4106 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4107 * actually changed, because vmx_set_cr0 refers to efer set above. 4108 * 4109 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4110 * (KVM doesn't change it); 4111 */ 4112 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 4113 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4114 4115 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 4116 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4117 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4118 4119 nested_ept_uninit_mmu_context(vcpu); 4120 4121 /* 4122 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4123 * couldn't have changed. 4124 */ 4125 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored)) 4126 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4127 4128 if (!enable_ept) 4129 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 4130 4131 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4132 4133 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4134 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4135 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4136 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4137 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4138 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4139 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4140 4141 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. 
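 * This mirrors the architectural behaviour: a VM-exit either clears
 * IA32_BNDCFGS (when the "clear BNDCFGS" exit control is set) or leaves
 * it untouched; there is no host-state field to load it from.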
*/ 4142 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4143 vmcs_write64(GUEST_BNDCFGS, 0); 4144 4145 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4146 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4147 vcpu->arch.pat = vmcs12->host_ia32_pat; 4148 } 4149 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 4150 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4151 vmcs12->host_ia32_perf_global_ctrl)); 4152 4153 /* Set L1 segment info according to Intel SDM 4154 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4155 seg = (struct kvm_segment) { 4156 .base = 0, 4157 .limit = 0xFFFFFFFF, 4158 .selector = vmcs12->host_cs_selector, 4159 .type = 11, 4160 .present = 1, 4161 .s = 1, 4162 .g = 1 4163 }; 4164 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4165 seg.l = 1; 4166 else 4167 seg.db = 1; 4168 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4169 seg = (struct kvm_segment) { 4170 .base = 0, 4171 .limit = 0xFFFFFFFF, 4172 .type = 3, 4173 .present = 1, 4174 .s = 1, 4175 .db = 1, 4176 .g = 1 4177 }; 4178 seg.selector = vmcs12->host_ds_selector; 4179 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4180 seg.selector = vmcs12->host_es_selector; 4181 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4182 seg.selector = vmcs12->host_ss_selector; 4183 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4184 seg.selector = vmcs12->host_fs_selector; 4185 seg.base = vmcs12->host_fs_base; 4186 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4187 seg.selector = vmcs12->host_gs_selector; 4188 seg.base = vmcs12->host_gs_base; 4189 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4190 seg = (struct kvm_segment) { 4191 .base = vmcs12->host_tr_base, 4192 .limit = 0x67, 4193 .selector = vmcs12->host_tr_selector, 4194 .type = 11, 4195 .present = 1 4196 }; 4197 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4198 4199 kvm_set_dr(vcpu, 7, 0x400); 4200 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4201 4202 if (cpu_has_vmx_msr_bitmap()) 4203 vmx_update_msr_bitmap(vcpu); 4204 4205 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4206 vmcs12->vm_exit_msr_load_count)) 4207 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4208 } 4209 4210 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4211 { 4212 struct shared_msr_entry *efer_msr; 4213 unsigned int i; 4214 4215 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4216 return vmcs_read64(GUEST_IA32_EFER); 4217 4218 if (cpu_has_load_ia32_efer()) 4219 return host_efer; 4220 4221 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4222 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4223 return vmx->msr_autoload.guest.val[i].value; 4224 } 4225 4226 efer_msr = find_msr_entry(vmx, MSR_EFER); 4227 if (efer_msr) 4228 return efer_msr->data; 4229 4230 return host_efer; 4231 } 4232 4233 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4234 { 4235 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4236 struct vcpu_vmx *vmx = to_vmx(vcpu); 4237 struct vmx_msr_entry g, h; 4238 gpa_t gpa; 4239 u32 i, j; 4240 4241 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4242 4243 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4244 /* 4245 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4246 * as vmcs01.GUEST_DR7 contains a userspace defined value 4247 * and vcpu->arch.dr7 is not squirreled away before the 4248 * nested VMENTER (not worth adding a variable in nested_vmx). 
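 * Falling back to DR7_FIXED_1 effectively hands L1 a reset DR7, which
 * is the least-bad option when the original value cannot be recovered.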
4249 */ 4250 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4251 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4252 else 4253 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4254 } 4255 4256 /* 4257 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4258 * handle a variety of side effects to KVM's software model. 4259 */ 4260 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4261 4262 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 4263 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4264 4265 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4266 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4267 4268 nested_ept_uninit_mmu_context(vcpu); 4269 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4270 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4271 4272 /* 4273 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4274 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4275 * VMFail, like everything else we just need to ensure our 4276 * software model is up-to-date. 4277 */ 4278 if (enable_ept && is_pae_paging(vcpu)) 4279 ept_save_pdptrs(vcpu); 4280 4281 kvm_mmu_reset_context(vcpu); 4282 4283 if (cpu_has_vmx_msr_bitmap()) 4284 vmx_update_msr_bitmap(vcpu); 4285 4286 /* 4287 * This nasty bit of open coding is a compromise between blindly 4288 * loading L1's MSRs using the exit load lists (incorrect emulation 4289 * of VMFail), leaving the nested VM's MSRs in the software model 4290 * (incorrect behavior) and snapshotting the modified MSRs (too 4291 * expensive since the lists are unbound by hardware). For each 4292 * MSR that was (prematurely) loaded from the nested VMEntry load 4293 * list, reload it from the exit load list if it exists and differs 4294 * from the guest value. The intent is to stuff host state as 4295 * silently as possible, not to fully process the exit load list. 4296 */ 4297 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4298 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4299 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4300 pr_debug_ratelimited( 4301 "%s read MSR index failed (%u, 0x%08llx)\n", 4302 __func__, i, gpa); 4303 goto vmabort; 4304 } 4305 4306 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4307 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4308 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4309 pr_debug_ratelimited( 4310 "%s read MSR failed (%u, 0x%08llx)\n", 4311 __func__, j, gpa); 4312 goto vmabort; 4313 } 4314 if (h.index != g.index) 4315 continue; 4316 if (h.value == g.value) 4317 break; 4318 4319 if (nested_vmx_load_msr_check(vcpu, &h)) { 4320 pr_debug_ratelimited( 4321 "%s check failed (%u, 0x%x, 0x%x)\n", 4322 __func__, j, h.index, h.reserved); 4323 goto vmabort; 4324 } 4325 4326 if (kvm_set_msr(vcpu, h.index, h.value)) { 4327 pr_debug_ratelimited( 4328 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4329 __func__, j, h.index, h.value); 4330 goto vmabort; 4331 } 4332 } 4333 } 4334 4335 return; 4336 4337 vmabort: 4338 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4339 } 4340 4341 /* 4342 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4343 * and modify vmcs12 to make it see what it would expect to see there if 4344 * L2 was its real guest. 
Must only be called when in L2 (is_guest_mode()) 4345 */ 4346 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4347 u32 exit_intr_info, unsigned long exit_qualification) 4348 { 4349 struct vcpu_vmx *vmx = to_vmx(vcpu); 4350 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4351 4352 /* trying to cancel vmlaunch/vmresume is a bug */ 4353 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4354 4355 /* Service the TLB flush request for L2 before switching to L1. */ 4356 if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) 4357 kvm_vcpu_flush_tlb_current(vcpu); 4358 4359 leave_guest_mode(vcpu); 4360 4361 if (nested_cpu_has_preemption_timer(vmcs12)) 4362 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4363 4364 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 4365 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 4366 4367 if (likely(!vmx->fail)) { 4368 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4369 4370 if (vm_exit_reason != -1) 4371 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4372 exit_intr_info, exit_qualification); 4373 4374 /* 4375 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4376 * also be used to capture vmcs12 cache as part of 4377 * capturing nVMX state for snapshot (migration). 4378 * 4379 * Otherwise, this flush will dirty guest memory at a 4380 * point it is already assumed by user-space to be 4381 * immutable. 4382 */ 4383 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4384 } else { 4385 /* 4386 * The only expected VM-instruction error is "VM entry with 4387 * invalid control field(s)." Anything else indicates a 4388 * problem with L0. And we should never get here with a 4389 * VMFail of any type if early consistency checks are enabled. 4390 */ 4391 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4392 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4393 WARN_ON_ONCE(nested_early_check); 4394 } 4395 4396 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4397 4398 /* Update any VMCS fields that might have changed while L2 ran */ 4399 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4400 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4401 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4402 if (vmx->nested.l1_tpr_threshold != -1) 4403 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4404 4405 if (kvm_has_tsc_control) 4406 decache_tsc_multiplier(vmx); 4407 4408 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4409 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4410 vmx_set_virtual_apic_mode(vcpu); 4411 } 4412 4413 /* Unpin physical memory we referred to in vmcs02 */ 4414 if (vmx->nested.apic_access_page) { 4415 kvm_release_page_clean(vmx->nested.apic_access_page); 4416 vmx->nested.apic_access_page = NULL; 4417 } 4418 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4419 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4420 vmx->nested.pi_desc = NULL; 4421 4422 if (vmx->nested.reload_vmcs01_apic_access_page) { 4423 vmx->nested.reload_vmcs01_apic_access_page = false; 4424 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4425 } 4426 4427 if ((vm_exit_reason != -1) && 4428 (enable_shadow_vmcs || vmx->nested.hv_evmcs)) 4429 vmx->nested.need_vmcs12_to_shadow_sync = true; 4430 4431 /* in case we halted in L2 */ 4432 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4433 4434 if (likely(!vmx->fail)) { 4435 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4436 nested_exit_intr_ack_set(vcpu)) { 4437 int irq = kvm_cpu_get_interrupt(vcpu); 4438 WARN_ON(irq < 0); 4439 vmcs12->vm_exit_intr_info = irq 
| 4440 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4441 } 4442 4443 if (vm_exit_reason != -1) 4444 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4445 vmcs12->exit_qualification, 4446 vmcs12->idt_vectoring_info_field, 4447 vmcs12->vm_exit_intr_info, 4448 vmcs12->vm_exit_intr_error_code, 4449 KVM_ISA_VMX); 4450 4451 load_vmcs12_host_state(vcpu, vmcs12); 4452 4453 return; 4454 } 4455 4456 /* 4457 * After an early L2 VM-entry failure, we're now back 4458 * in L1 which thinks it just finished a VMLAUNCH or 4459 * VMRESUME instruction, so we need to set the failure 4460 * flag and the VM-instruction error field of the VMCS 4461 * accordingly, and skip the emulated instruction. 4462 */ 4463 (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4464 4465 /* 4466 * Restore L1's host state to KVM's software model. We're here 4467 * because a consistency check was caught by hardware, which 4468 * means some amount of guest state has been propagated to KVM's 4469 * model and needs to be unwound to the host's state. 4470 */ 4471 nested_vmx_restore_host_state(vcpu); 4472 4473 vmx->fail = 0; 4474 } 4475 4476 /* 4477 * Decode the memory-address operand of a vmx instruction, as recorded on an 4478 * exit caused by such an instruction (run by a guest hypervisor). 4479 * On success, returns 0. When the operand is invalid, returns 1 and throws 4480 * #UD, #GP, or #SS. 4481 */ 4482 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4483 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4484 { 4485 gva_t off; 4486 bool exn; 4487 struct kvm_segment s; 4488 4489 /* 4490 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4491 * Execution", on an exit, vmx_instruction_info holds most of the 4492 * addressing components of the operand. Only the displacement part 4493 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4494 * For how an actual address is calculated from all these components, 4495 * refer to Vol. 1, "Operand Addressing". 4496 */ 4497 int scaling = vmx_instruction_info & 3; 4498 int addr_size = (vmx_instruction_info >> 7) & 7; 4499 bool is_reg = vmx_instruction_info & (1u << 10); 4500 int seg_reg = (vmx_instruction_info >> 15) & 7; 4501 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4502 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4503 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4504 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4505 4506 if (is_reg) { 4507 kvm_queue_exception(vcpu, UD_VECTOR); 4508 return 1; 4509 } 4510 4511 /* Addr = segment_base + offset */ 4512 /* offset = base + [index * scale] + displacement */ 4513 off = exit_qualification; /* holds the displacement */ 4514 if (addr_size == 1) 4515 off = (gva_t)sign_extend64(off, 31); 4516 else if (addr_size == 0) 4517 off = (gva_t)sign_extend64(off, 15); 4518 if (base_is_valid) 4519 off += kvm_register_read(vcpu, base_reg); 4520 if (index_is_valid) 4521 off += kvm_register_read(vcpu, index_reg) << scaling; 4522 vmx_get_segment(vcpu, &s, seg_reg); 4523 4524 /* 4525 * The effective address, i.e. @off, of a memory operand is truncated 4526 * based on the address size of the instruction. Note that this is 4527 * the *effective address*, i.e. the address prior to accounting for 4528 * the segment's base. 4529 */ 4530 if (addr_size == 1) /* 32 bit */ 4531 off &= 0xffffffff; 4532 else if (addr_size == 0) /* 16 bit */ 4533 off &= 0xffff; 4534 4535 /* Checks for #GP/#SS exceptions. 
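 * Long mode only checks that the resulting address is canonical;
 * protected mode additionally checks segment type, usability and limit.
 * The vector is chosen at the end: #SS(0) if the operand used SS,
 * #GP(0) otherwise.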
*/ 4536 exn = false; 4537 if (is_long_mode(vcpu)) { 4538 /* 4539 * The virtual/linear address is never truncated in 64-bit 4540 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4541 * address when using FS/GS with a non-zero base. 4542 */ 4543 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4544 *ret = s.base + off; 4545 else 4546 *ret = off; 4547 4548 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4549 * non-canonical form. This is the only check on the memory 4550 * destination for long mode! 4551 */ 4552 exn = is_noncanonical_address(*ret, vcpu); 4553 } else { 4554 /* 4555 * When not in long mode, the virtual/linear address is 4556 * unconditionally truncated to 32 bits regardless of the 4557 * address size. 4558 */ 4559 *ret = (s.base + off) & 0xffffffff; 4560 4561 /* Protected mode: apply checks for segment validity in the 4562 * following order: 4563 * - segment type check (#GP(0) may be thrown) 4564 * - usability check (#GP(0)/#SS(0)) 4565 * - limit check (#GP(0)/#SS(0)) 4566 */ 4567 if (wr) 4568 /* #GP(0) if the destination operand is located in a 4569 * read-only data segment or any code segment. 4570 */ 4571 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4572 else 4573 /* #GP(0) if the source operand is located in an 4574 * execute-only code segment 4575 */ 4576 exn = ((s.type & 0xa) == 8); 4577 if (exn) { 4578 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4579 return 1; 4580 } 4581 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4582 */ 4583 exn = (s.unusable != 0); 4584 4585 /* 4586 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4587 * outside the segment limit. All CPUs that support VMX ignore 4588 * limit checks for flat segments, i.e. segments with base==0, 4589 * limit==0xffffffff and of type expand-up data or code. 4590 */ 4591 if (!(s.base == 0 && s.limit == 0xffffffff && 4592 ((s.type & 8) || !(s.type & 4)))) 4593 exn = exn || ((u64)off + len - 1 > s.limit); 4594 } 4595 if (exn) { 4596 kvm_queue_exception_e(vcpu, 4597 seg_reg == VCPU_SREG_SS ? 4598 SS_VECTOR : GP_VECTOR, 4599 0); 4600 return 1; 4601 } 4602 4603 return 0; 4604 } 4605 4606 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) 4607 { 4608 struct vcpu_vmx *vmx; 4609 4610 if (!nested_vmx_allowed(vcpu)) 4611 return; 4612 4613 vmx = to_vmx(vcpu); 4614 if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { 4615 vmx->nested.msrs.entry_ctls_high |= 4616 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4617 vmx->nested.msrs.exit_ctls_high |= 4618 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4619 } else { 4620 vmx->nested.msrs.entry_ctls_high &= 4621 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4622 vmx->nested.msrs.exit_ctls_high &= 4623 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4624 } 4625 } 4626 4627 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 4628 int *ret) 4629 { 4630 gva_t gva; 4631 struct x86_exception e; 4632 int r; 4633 4634 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 4635 vmcs_read32(VMX_INSTRUCTION_INFO), false, 4636 sizeof(*vmpointer), &gva)) { 4637 *ret = 1; 4638 return -EINVAL; 4639 } 4640 4641 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 4642 if (r != X86EMUL_CONTINUE) { 4643 *ret = vmx_handle_memory_failure(vcpu, r, &e); 4644 return -EINVAL; 4645 } 4646 4647 return 0; 4648 } 4649 4650 /* 4651 * Allocate a shadow VMCS and associate it with the currently loaded 4652 * VMCS, unless such a shadow VMCS already exists. 
The newly allocated 4653 * VMCS is also VMCLEARed, so that it is ready for use. 4654 */ 4655 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 4656 { 4657 struct vcpu_vmx *vmx = to_vmx(vcpu); 4658 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 4659 4660 /* 4661 * We should allocate a shadow vmcs for vmcs01 only when L1 4662 * executes VMXON and free it when L1 executes VMXOFF. 4663 * As it is invalid to execute VMXON twice, we shouldn't reach 4664 * here when vmcs01 already have an allocated shadow vmcs. 4665 */ 4666 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); 4667 4668 if (!loaded_vmcs->shadow_vmcs) { 4669 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 4670 if (loaded_vmcs->shadow_vmcs) 4671 vmcs_clear(loaded_vmcs->shadow_vmcs); 4672 } 4673 return loaded_vmcs->shadow_vmcs; 4674 } 4675 4676 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 4677 { 4678 struct vcpu_vmx *vmx = to_vmx(vcpu); 4679 int r; 4680 4681 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 4682 if (r < 0) 4683 goto out_vmcs02; 4684 4685 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4686 if (!vmx->nested.cached_vmcs12) 4687 goto out_cached_vmcs12; 4688 4689 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4690 if (!vmx->nested.cached_shadow_vmcs12) 4691 goto out_cached_shadow_vmcs12; 4692 4693 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 4694 goto out_shadow_vmcs; 4695 4696 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 4697 HRTIMER_MODE_ABS_PINNED); 4698 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 4699 4700 vmx->nested.vpid02 = allocate_vpid(); 4701 4702 vmx->nested.vmcs02_initialized = false; 4703 vmx->nested.vmxon = true; 4704 4705 if (vmx_pt_mode_is_host_guest()) { 4706 vmx->pt_desc.guest.ctl = 0; 4707 pt_update_intercept_for_msr(vmx); 4708 } 4709 4710 return 0; 4711 4712 out_shadow_vmcs: 4713 kfree(vmx->nested.cached_shadow_vmcs12); 4714 4715 out_cached_shadow_vmcs12: 4716 kfree(vmx->nested.cached_vmcs12); 4717 4718 out_cached_vmcs12: 4719 free_loaded_vmcs(&vmx->nested.vmcs02); 4720 4721 out_vmcs02: 4722 return -ENOMEM; 4723 } 4724 4725 /* 4726 * Emulate the VMXON instruction. 4727 * Currently, we just remember that VMX is active, and do not save or even 4728 * inspect the argument to VMXON (the so-called "VMXON pointer") because we 4729 * do not currently need to store anything in that guest-allocated memory 4730 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their 4731 * argument is different from the VMXON pointer (which the spec says they do). 4732 */ 4733 static int handle_vmon(struct kvm_vcpu *vcpu) 4734 { 4735 int ret; 4736 gpa_t vmptr; 4737 uint32_t revision; 4738 struct vcpu_vmx *vmx = to_vmx(vcpu); 4739 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 4740 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 4741 4742 /* 4743 * The Intel VMX Instruction Reference lists a bunch of bits that are 4744 * prerequisite to running VMXON, most notably cr4.VMXE must be set to 4745 * 1 (see vmx_set_cr4() for when we allow the guest to set this). 4746 * Otherwise, we should fail with #UD. But most faulting conditions 4747 * have already been checked by hardware, prior to the VM-exit for 4748 * VMXON. We do test guest cr4.VMXE because processor CR4 always has 4749 * that bit set to 1 in non-root mode. 4750 */ 4751 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { 4752 kvm_queue_exception(vcpu, UD_VECTOR); 4753 return 1; 4754 } 4755 4756 /* CPL=0 must be checked manually. 
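 * (In VMX non-root operation VMXON causes a VM-exit before the CPL
 * related #GP checks that bare metal would perform, so this is one
 * condition hardware has not filtered for us.)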
*/ 4757 if (vmx_get_cpl(vcpu)) { 4758 kvm_inject_gp(vcpu, 0); 4759 return 1; 4760 } 4761 4762 if (vmx->nested.vmxon) 4763 return nested_vmx_failValid(vcpu, 4764 VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 4765 4766 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 4767 != VMXON_NEEDED_FEATURES) { 4768 kvm_inject_gp(vcpu, 0); 4769 return 1; 4770 } 4771 4772 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 4773 return ret; 4774 4775 /* 4776 * SDM 3: 24.11.5 4777 * The first 4 bytes of VMXON region contain the supported 4778 * VMCS revision identifier 4779 * 4780 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 4781 * which replaces physical address width with 32 4782 */ 4783 if (!page_address_valid(vcpu, vmptr)) 4784 return nested_vmx_failInvalid(vcpu); 4785 4786 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 4787 revision != VMCS12_REVISION) 4788 return nested_vmx_failInvalid(vcpu); 4789 4790 vmx->nested.vmxon_ptr = vmptr; 4791 ret = enter_vmx_operation(vcpu); 4792 if (ret) 4793 return ret; 4794 4795 return nested_vmx_succeed(vcpu); 4796 } 4797 4798 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 4799 { 4800 struct vcpu_vmx *vmx = to_vmx(vcpu); 4801 4802 if (vmx->nested.current_vmptr == -1ull) 4803 return; 4804 4805 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 4806 4807 if (enable_shadow_vmcs) { 4808 /* copy to memory all shadowed fields in case 4809 they were modified */ 4810 copy_shadow_to_vmcs12(vmx); 4811 vmx_disable_shadow_vmcs(vmx); 4812 } 4813 vmx->nested.posted_intr_nv = -1; 4814 4815 /* Flush VMCS12 to guest memory */ 4816 kvm_vcpu_write_guest_page(vcpu, 4817 vmx->nested.current_vmptr >> PAGE_SHIFT, 4818 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 4819 4820 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 4821 4822 vmx->nested.current_vmptr = -1ull; 4823 } 4824 4825 /* Emulate the VMXOFF instruction */ 4826 static int handle_vmoff(struct kvm_vcpu *vcpu) 4827 { 4828 if (!nested_vmx_check_permission(vcpu)) 4829 return 1; 4830 4831 free_nested(vcpu); 4832 4833 /* Process a latched INIT during time CPU was in VMX operation */ 4834 kvm_make_request(KVM_REQ_EVENT, vcpu); 4835 4836 return nested_vmx_succeed(vcpu); 4837 } 4838 4839 /* Emulate the VMCLEAR instruction */ 4840 static int handle_vmclear(struct kvm_vcpu *vcpu) 4841 { 4842 struct vcpu_vmx *vmx = to_vmx(vcpu); 4843 u32 zero = 0; 4844 gpa_t vmptr; 4845 u64 evmcs_gpa; 4846 int r; 4847 4848 if (!nested_vmx_check_permission(vcpu)) 4849 return 1; 4850 4851 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 4852 return r; 4853 4854 if (!page_address_valid(vcpu, vmptr)) 4855 return nested_vmx_failValid(vcpu, 4856 VMXERR_VMCLEAR_INVALID_ADDRESS); 4857 4858 if (vmptr == vmx->nested.vmxon_ptr) 4859 return nested_vmx_failValid(vcpu, 4860 VMXERR_VMCLEAR_VMXON_POINTER); 4861 4862 /* 4863 * When Enlightened VMEntry is enabled on the calling CPU we treat 4864 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 4865 * way to distinguish it from VMCS12) and we must not corrupt it by 4866 * writing to the non-existent 'launch_state' field. The area doesn't 4867 * have to be the currently active EVMCS on the calling CPU and there's 4868 * nothing KVM has to do to transition it from 'active' to 'non-active' 4869 * state. It is possible that the area will stay mapped as 4870 * vmx->nested.hv_evmcs but this shouldn't be a problem. 
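 * In the enlightened case the emulated VMCLEAR therefore degenerates to
 * reporting success without touching guest memory.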
4871 */ 4872 if (likely(!vmx->nested.enlightened_vmcs_enabled || 4873 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 4874 if (vmptr == vmx->nested.current_vmptr) 4875 nested_release_vmcs12(vcpu); 4876 4877 kvm_vcpu_write_guest(vcpu, 4878 vmptr + offsetof(struct vmcs12, 4879 launch_state), 4880 &zero, sizeof(zero)); 4881 } 4882 4883 return nested_vmx_succeed(vcpu); 4884 } 4885 4886 /* Emulate the VMLAUNCH instruction */ 4887 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 4888 { 4889 return nested_vmx_run(vcpu, true); 4890 } 4891 4892 /* Emulate the VMRESUME instruction */ 4893 static int handle_vmresume(struct kvm_vcpu *vcpu) 4894 { 4895 4896 return nested_vmx_run(vcpu, false); 4897 } 4898 4899 static int handle_vmread(struct kvm_vcpu *vcpu) 4900 { 4901 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 4902 : get_vmcs12(vcpu); 4903 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 4904 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4905 struct vcpu_vmx *vmx = to_vmx(vcpu); 4906 struct x86_exception e; 4907 unsigned long field; 4908 u64 value; 4909 gva_t gva = 0; 4910 short offset; 4911 int len, r; 4912 4913 if (!nested_vmx_check_permission(vcpu)) 4914 return 1; 4915 4916 /* 4917 * In VMX non-root operation, when the VMCS-link pointer is -1ull, 4918 * any VMREAD sets the ALU flags for VMfailInvalid. 4919 */ 4920 if (vmx->nested.current_vmptr == -1ull || 4921 (is_guest_mode(vcpu) && 4922 get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) 4923 return nested_vmx_failInvalid(vcpu); 4924 4925 /* Decode instruction info and find the field to read */ 4926 field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); 4927 4928 offset = vmcs_field_to_offset(field); 4929 if (offset < 0) 4930 return nested_vmx_failValid(vcpu, 4931 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4932 4933 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 4934 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4935 4936 /* Read the field, zero-extended to a u64 value */ 4937 value = vmcs12_read_any(vmcs12, field, offset); 4938 4939 /* 4940 * Now copy part of this value to register or memory, as requested. 4941 * Note that the number of bits actually copied is 32 or 64 depending 4942 * on the guest's mode (32 or 64 bit), not on the given field's length. 4943 */ 4944 if (instr_info & BIT(10)) { 4945 kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value); 4946 } else { 4947 len = is_64_bit_mode(vcpu) ? 8 : 4; 4948 if (get_vmx_mem_address(vcpu, exit_qualification, 4949 instr_info, true, len, &gva)) 4950 return 1; 4951 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 4952 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 4953 if (r != X86EMUL_CONTINUE) 4954 return vmx_handle_memory_failure(vcpu, r, &e); 4955 } 4956 4957 return nested_vmx_succeed(vcpu); 4958 } 4959 4960 static bool is_shadow_field_rw(unsigned long field) 4961 { 4962 switch (field) { 4963 #define SHADOW_FIELD_RW(x, y) case x: 4964 #include "vmcs_shadow_fields.h" 4965 return true; 4966 default: 4967 break; 4968 } 4969 return false; 4970 } 4971 4972 static bool is_shadow_field_ro(unsigned long field) 4973 { 4974 switch (field) { 4975 #define SHADOW_FIELD_RO(x, y) case x: 4976 #include "vmcs_shadow_fields.h" 4977 return true; 4978 default: 4979 break; 4980 } 4981 return false; 4982 } 4983 4984 static int handle_vmwrite(struct kvm_vcpu *vcpu) 4985 { 4986 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? 
get_shadow_vmcs12(vcpu) 4987 : get_vmcs12(vcpu); 4988 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 4989 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4990 struct vcpu_vmx *vmx = to_vmx(vcpu); 4991 struct x86_exception e; 4992 unsigned long field; 4993 short offset; 4994 gva_t gva; 4995 int len, r; 4996 4997 /* 4998 * The value to write might be 32 or 64 bits, depending on L1's long 4999 * mode, and eventually we need to write that into a field of several 5000 * possible lengths. The code below first zero-extends the value to 64 5001 * bit (value), and then copies only the appropriate number of 5002 * bits into the vmcs12 field. 5003 */ 5004 u64 value = 0; 5005 5006 if (!nested_vmx_check_permission(vcpu)) 5007 return 1; 5008 5009 /* 5010 * In VMX non-root operation, when the VMCS-link pointer is -1ull, 5011 * any VMWRITE sets the ALU flags for VMfailInvalid. 5012 */ 5013 if (vmx->nested.current_vmptr == -1ull || 5014 (is_guest_mode(vcpu) && 5015 get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)) 5016 return nested_vmx_failInvalid(vcpu); 5017 5018 if (instr_info & BIT(10)) 5019 value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf)); 5020 else { 5021 len = is_64_bit_mode(vcpu) ? 8 : 4; 5022 if (get_vmx_mem_address(vcpu, exit_qualification, 5023 instr_info, false, len, &gva)) 5024 return 1; 5025 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5026 if (r != X86EMUL_CONTINUE) 5027 return vmx_handle_memory_failure(vcpu, r, &e); 5028 } 5029 5030 field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf)); 5031 5032 offset = vmcs_field_to_offset(field); 5033 if (offset < 0) 5034 return nested_vmx_failValid(vcpu, 5035 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5036 5037 /* 5038 * If the vCPU supports "VMWRITE to any supported field in the 5039 * VMCS," then the "read-only" fields are actually read/write. 5040 */ 5041 if (vmcs_field_readonly(field) && 5042 !nested_cpu_has_vmwrite_any_field(vcpu)) 5043 return nested_vmx_failValid(vcpu, 5044 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5045 5046 /* 5047 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5048 * vmcs12, else we may crush a field or consume a stale value. 5049 */ 5050 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5051 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5052 5053 /* 5054 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5055 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5056 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5057 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5058 * from L1 will return a different value than VMREAD from L2 (L1 sees 5059 * the stripped down value, L2 sees the full value as stored by KVM). 5060 */ 5061 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5062 value &= 0x1f0ff; 5063 5064 vmcs12_write_any(vmcs12, field, offset, value); 5065 5066 /* 5067 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5068 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5069 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5070 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5071 */ 5072 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5073 /* 5074 * L1 can read these fields without exiting, ensure the 5075 * shadow VMCS is up-to-date. 
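 * Updating it requires the shadow VMCS to be the current VMCS, hence
 * the vmcs_load()/__vmcs_writel()/vmcs_clear() sequence below, done
 * with preemption disabled.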
5076 */ 5077 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5078 preempt_disable(); 5079 vmcs_load(vmx->vmcs01.shadow_vmcs); 5080 5081 __vmcs_writel(field, value); 5082 5083 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5084 vmcs_load(vmx->loaded_vmcs->vmcs); 5085 preempt_enable(); 5086 } 5087 vmx->nested.dirty_vmcs12 = true; 5088 } 5089 5090 return nested_vmx_succeed(vcpu); 5091 } 5092 5093 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5094 { 5095 vmx->nested.current_vmptr = vmptr; 5096 if (enable_shadow_vmcs) { 5097 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5098 vmcs_write64(VMCS_LINK_POINTER, 5099 __pa(vmx->vmcs01.shadow_vmcs)); 5100 vmx->nested.need_vmcs12_to_shadow_sync = true; 5101 } 5102 vmx->nested.dirty_vmcs12 = true; 5103 } 5104 5105 /* Emulate the VMPTRLD instruction */ 5106 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5107 { 5108 struct vcpu_vmx *vmx = to_vmx(vcpu); 5109 gpa_t vmptr; 5110 int r; 5111 5112 if (!nested_vmx_check_permission(vcpu)) 5113 return 1; 5114 5115 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5116 return r; 5117 5118 if (!page_address_valid(vcpu, vmptr)) 5119 return nested_vmx_failValid(vcpu, 5120 VMXERR_VMPTRLD_INVALID_ADDRESS); 5121 5122 if (vmptr == vmx->nested.vmxon_ptr) 5123 return nested_vmx_failValid(vcpu, 5124 VMXERR_VMPTRLD_VMXON_POINTER); 5125 5126 /* Forbid normal VMPTRLD if Enlightened version was used */ 5127 if (vmx->nested.hv_evmcs) 5128 return 1; 5129 5130 if (vmx->nested.current_vmptr != vmptr) { 5131 struct kvm_host_map map; 5132 struct vmcs12 *new_vmcs12; 5133 5134 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { 5135 /* 5136 * Reads from an unbacked page return all 1s, 5137 * which means that the 32 bits located at the 5138 * given physical address won't match the required 5139 * VMCS12_REVISION identifier. 5140 */ 5141 return nested_vmx_failValid(vcpu, 5142 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5143 } 5144 5145 new_vmcs12 = map.hva; 5146 5147 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || 5148 (new_vmcs12->hdr.shadow_vmcs && 5149 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5150 kvm_vcpu_unmap(vcpu, &map, false); 5151 return nested_vmx_failValid(vcpu, 5152 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5153 } 5154 5155 nested_release_vmcs12(vcpu); 5156 5157 /* 5158 * Load VMCS12 from guest memory since it is not already 5159 * cached. 
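 *
 * The copy below fills vmx->nested.cached_vmcs12, which is what
 * KVM operates on from here on; the cache is flushed back to
 * guest memory when this vmptr is released (e.g. on VMCLEAR or a
 * later VMPTRLD of a different VMCS).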
5160 */ 5161 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); 5162 kvm_vcpu_unmap(vcpu, &map, false); 5163 5164 set_current_vmptr(vmx, vmptr); 5165 } 5166 5167 return nested_vmx_succeed(vcpu); 5168 } 5169 5170 /* Emulate the VMPTRST instruction */ 5171 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5172 { 5173 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5174 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5175 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5176 struct x86_exception e; 5177 gva_t gva; 5178 int r; 5179 5180 if (!nested_vmx_check_permission(vcpu)) 5181 return 1; 5182 5183 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) 5184 return 1; 5185 5186 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5187 true, sizeof(gpa_t), &gva)) 5188 return 1; 5189 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5190 r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, 5191 sizeof(gpa_t), &e); 5192 if (r != X86EMUL_CONTINUE) 5193 return vmx_handle_memory_failure(vcpu, r, &e); 5194 5195 return nested_vmx_succeed(vcpu); 5196 } 5197 5198 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 5199 5200 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 5201 { 5202 return VALID_PAGE(root_hpa) && 5203 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 5204 } 5205 5206 /* Emulate the INVEPT instruction */ 5207 static int handle_invept(struct kvm_vcpu *vcpu) 5208 { 5209 struct vcpu_vmx *vmx = to_vmx(vcpu); 5210 u32 vmx_instruction_info, types; 5211 unsigned long type, roots_to_free; 5212 struct kvm_mmu *mmu; 5213 gva_t gva; 5214 struct x86_exception e; 5215 struct { 5216 u64 eptp, gpa; 5217 } operand; 5218 int i, r; 5219 5220 if (!(vmx->nested.msrs.secondary_ctls_high & 5221 SECONDARY_EXEC_ENABLE_EPT) || 5222 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5223 kvm_queue_exception(vcpu, UD_VECTOR); 5224 return 1; 5225 } 5226 5227 if (!nested_vmx_check_permission(vcpu)) 5228 return 1; 5229 5230 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5231 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 5232 5233 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5234 5235 if (type >= 32 || !(types & (1 << type))) 5236 return nested_vmx_failValid(vcpu, 5237 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5238 5239 /* According to the Intel VMX instruction reference, the memory 5240 * operand is read even if it isn't needed (e.g., for type==global) 5241 */ 5242 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5243 vmx_instruction_info, false, sizeof(operand), &gva)) 5244 return 1; 5245 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5246 if (r != X86EMUL_CONTINUE) 5247 return vmx_handle_memory_failure(vcpu, r, &e); 5248 5249 /* 5250 * Nested EPT roots are always held through guest_mmu, 5251 * not root_mmu.
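 * (root_mmu is what L1 itself runs on; guest_mmu only holds the
 * shadow EPT roots built for L2, which are the only translations
 * an INVEPT from L1 can affect.)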
5252 */ 5253 mmu = &vcpu->arch.guest_mmu; 5254 5255 switch (type) { 5256 case VMX_EPT_EXTENT_CONTEXT: 5257 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5258 return nested_vmx_failValid(vcpu, 5259 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5260 5261 roots_to_free = 0; 5262 if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd, 5263 operand.eptp)) 5264 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5265 5266 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5267 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5268 mmu->prev_roots[i].pgd, 5269 operand.eptp)) 5270 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5271 } 5272 break; 5273 case VMX_EPT_EXTENT_GLOBAL: 5274 roots_to_free = KVM_MMU_ROOTS_ALL; 5275 break; 5276 default: 5277 BUG(); 5278 break; 5279 } 5280 5281 if (roots_to_free) 5282 kvm_mmu_free_roots(vcpu, mmu, roots_to_free); 5283 5284 return nested_vmx_succeed(vcpu); 5285 } 5286 5287 static int handle_invvpid(struct kvm_vcpu *vcpu) 5288 { 5289 struct vcpu_vmx *vmx = to_vmx(vcpu); 5290 u32 vmx_instruction_info; 5291 unsigned long type, types; 5292 gva_t gva; 5293 struct x86_exception e; 5294 struct { 5295 u64 vpid; 5296 u64 gla; 5297 } operand; 5298 u16 vpid02; 5299 int r; 5300 5301 if (!(vmx->nested.msrs.secondary_ctls_high & 5302 SECONDARY_EXEC_ENABLE_VPID) || 5303 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5304 kvm_queue_exception(vcpu, UD_VECTOR); 5305 return 1; 5306 } 5307 5308 if (!nested_vmx_check_permission(vcpu)) 5309 return 1; 5310 5311 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5312 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 5313 5314 types = (vmx->nested.msrs.vpid_caps & 5315 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5316 5317 if (type >= 32 || !(types & (1 << type))) 5318 return nested_vmx_failValid(vcpu, 5319 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5320 5321 /* according to the intel vmx instruction reference, the memory 5322 * operand is read even if it isn't needed (e.g., for type==global) 5323 */ 5324 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5325 vmx_instruction_info, false, sizeof(operand), &gva)) 5326 return 1; 5327 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5328 if (r != X86EMUL_CONTINUE) 5329 return vmx_handle_memory_failure(vcpu, r, &e); 5330 5331 if (operand.vpid >> 16) 5332 return nested_vmx_failValid(vcpu, 5333 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5334 5335 vpid02 = nested_get_vpid02(vcpu); 5336 switch (type) { 5337 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5338 if (!operand.vpid || 5339 is_noncanonical_address(operand.gla, vcpu)) 5340 return nested_vmx_failValid(vcpu, 5341 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5342 vpid_sync_vcpu_addr(vpid02, operand.gla); 5343 break; 5344 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5345 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5346 if (!operand.vpid) 5347 return nested_vmx_failValid(vcpu, 5348 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5349 vpid_sync_context(vpid02); 5350 break; 5351 case VMX_VPID_EXTENT_ALL_CONTEXT: 5352 vpid_sync_context(vpid02); 5353 break; 5354 default: 5355 WARN_ON_ONCE(1); 5356 return kvm_skip_emulated_instruction(vcpu); 5357 } 5358 5359 /* 5360 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5361 * linear mappings for L2 (tagged with L2's VPID). Free all roots as 5362 * VPIDs are not tracked in the MMU role. 5363 * 5364 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5365 * an MMU when EPT is disabled. 5366 * 5367 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 
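 *
 * (The vpid_sync_* calls above have already flushed the hardware
 * TLB entries tagged with vpid02; the root freeing below only
 * covers KVM's software shadow page tables.)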
5368 */ 5369 if (!enable_ept) 5370 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, 5371 KVM_MMU_ROOTS_ALL); 5372 5373 return nested_vmx_succeed(vcpu); 5374 } 5375 5376 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5377 struct vmcs12 *vmcs12) 5378 { 5379 u32 index = kvm_rcx_read(vcpu); 5380 u64 new_eptp; 5381 bool accessed_dirty; 5382 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 5383 5384 if (!nested_cpu_has_eptp_switching(vmcs12) || 5385 !nested_cpu_has_ept(vmcs12)) 5386 return 1; 5387 5388 if (index >= VMFUNC_EPTP_ENTRIES) 5389 return 1; 5390 5391 5392 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5393 &new_eptp, index * 8, 8)) 5394 return 1; 5395 5396 accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT); 5397 5398 /* 5399 * If the (L2) guest does a vmfunc to the currently 5400 * active ept pointer, we don't have to do anything else 5401 */ 5402 if (vmcs12->ept_pointer != new_eptp) { 5403 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5404 return 1; 5405 5406 kvm_mmu_unload(vcpu); 5407 mmu->ept_ad = accessed_dirty; 5408 mmu->mmu_role.base.ad_disabled = !accessed_dirty; 5409 vmcs12->ept_pointer = new_eptp; 5410 /* 5411 * TODO: Check what's the correct approach in case 5412 * mmu reload fails. Currently, we just let the next 5413 * reload potentially fail 5414 */ 5415 kvm_mmu_reload(vcpu); 5416 } 5417 5418 return 0; 5419 } 5420 5421 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5422 { 5423 struct vcpu_vmx *vmx = to_vmx(vcpu); 5424 struct vmcs12 *vmcs12; 5425 u32 function = kvm_rax_read(vcpu); 5426 5427 /* 5428 * VMFUNC is only supported for nested guests, but we always enable the 5429 * secondary control for simplicity; for non-nested mode, fake that we 5430 * didn't by injecting #UD. 5431 */ 5432 if (!is_guest_mode(vcpu)) { 5433 kvm_queue_exception(vcpu, UD_VECTOR); 5434 return 1; 5435 } 5436 5437 vmcs12 = get_vmcs12(vcpu); 5438 if ((vmcs12->vm_function_control & (1 << function)) == 0) 5439 goto fail; 5440 5441 switch (function) { 5442 case 0: 5443 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5444 goto fail; 5445 break; 5446 default: 5447 goto fail; 5448 } 5449 return kvm_skip_emulated_instruction(vcpu); 5450 5451 fail: 5452 nested_vmx_vmexit(vcpu, vmx->exit_reason, 5453 vmx_get_intr_info(vcpu), 5454 vmx_get_exit_qual(vcpu)); 5455 return 1; 5456 } 5457 5458 /* 5459 * Return true if an IO instruction with the specified port and size should cause 5460 * a VM-exit into L1. 
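 *
 * The check mirrors the hardware layout: io_bitmap_a covers ports
 * 0x0000-0x7fff and io_bitmap_b covers 0x8000-0xffff, one bit per
 * port (e.g. port 0x80 maps to bit 0 of byte 0x10 in bitmap A).
 * A multi-byte access exits if any of the covered ports has its
 * bit set.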
5461 */ 5462 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5463 int size) 5464 { 5465 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5466 gpa_t bitmap, last_bitmap; 5467 u8 b; 5468 5469 last_bitmap = (gpa_t)-1; 5470 b = -1; 5471 5472 while (size > 0) { 5473 if (port < 0x8000) 5474 bitmap = vmcs12->io_bitmap_a; 5475 else if (port < 0x10000) 5476 bitmap = vmcs12->io_bitmap_b; 5477 else 5478 return true; 5479 bitmap += (port & 0x7fff) / 8; 5480 5481 if (last_bitmap != bitmap) 5482 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5483 return true; 5484 if (b & (1 << (port & 7))) 5485 return true; 5486 5487 port++; 5488 size--; 5489 last_bitmap = bitmap; 5490 } 5491 5492 return false; 5493 } 5494 5495 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5496 struct vmcs12 *vmcs12) 5497 { 5498 unsigned long exit_qualification; 5499 unsigned short port; 5500 int size; 5501 5502 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5503 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 5504 5505 exit_qualification = vmx_get_exit_qual(vcpu); 5506 5507 port = exit_qualification >> 16; 5508 size = (exit_qualification & 7) + 1; 5509 5510 return nested_vmx_check_io_bitmaps(vcpu, port, size); 5511 } 5512 5513 /* 5514 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 5515 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 5516 * disinterest in the current event (read or write a specific MSR) by using an 5517 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 5518 */ 5519 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 5520 struct vmcs12 *vmcs12, u32 exit_reason) 5521 { 5522 u32 msr_index = kvm_rcx_read(vcpu); 5523 gpa_t bitmap; 5524 5525 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 5526 return true; 5527 5528 /* 5529 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 5530 * for the four combinations of read/write and low/high MSR numbers. 5531 * First we need to figure out which of the four to use: 5532 */ 5533 bitmap = vmcs12->msr_bitmap; 5534 if (exit_reason == EXIT_REASON_MSR_WRITE) 5535 bitmap += 2048; 5536 if (msr_index >= 0xc0000000) { 5537 msr_index -= 0xc0000000; 5538 bitmap += 1024; 5539 } 5540 5541 /* Then read the msr_index'th bit from this bitmap: */ 5542 if (msr_index < 1024*8) { 5543 unsigned char b; 5544 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 5545 return true; 5546 return 1 & (b >> (msr_index & 7)); 5547 } else 5548 return true; /* let L1 handle the wrong parameter */ 5549 } 5550 5551 /* 5552 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 5553 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 5554 * intercept (via guest_host_mask etc.) the current event. 
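 *
 * For CR0/CR4 the decision is driven by the guest/host masks: a bit
 * set in crX_guest_host_mask is owned by L1, so a MOV to CRx that
 * would change such a bit away from the value in crX_read_shadow
 * must be forwarded to L1.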
5555 */ 5556 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 5557 struct vmcs12 *vmcs12) 5558 { 5559 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5560 int cr = exit_qualification & 15; 5561 int reg; 5562 unsigned long val; 5563 5564 switch ((exit_qualification >> 4) & 3) { 5565 case 0: /* mov to cr */ 5566 reg = (exit_qualification >> 8) & 15; 5567 val = kvm_register_readl(vcpu, reg); 5568 switch (cr) { 5569 case 0: 5570 if (vmcs12->cr0_guest_host_mask & 5571 (val ^ vmcs12->cr0_read_shadow)) 5572 return true; 5573 break; 5574 case 3: 5575 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 5576 return true; 5577 break; 5578 case 4: 5579 if (vmcs12->cr4_guest_host_mask & 5580 (vmcs12->cr4_read_shadow ^ val)) 5581 return true; 5582 break; 5583 case 8: 5584 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 5585 return true; 5586 break; 5587 } 5588 break; 5589 case 2: /* clts */ 5590 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 5591 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 5592 return true; 5593 break; 5594 case 1: /* mov from cr */ 5595 switch (cr) { 5596 case 3: 5597 if (vmcs12->cpu_based_vm_exec_control & 5598 CPU_BASED_CR3_STORE_EXITING) 5599 return true; 5600 break; 5601 case 8: 5602 if (vmcs12->cpu_based_vm_exec_control & 5603 CPU_BASED_CR8_STORE_EXITING) 5604 return true; 5605 break; 5606 } 5607 break; 5608 case 3: /* lmsw */ 5609 /* 5610 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 5611 * cr0. Other attempted changes are ignored, with no exit. 5612 */ 5613 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5614 if (vmcs12->cr0_guest_host_mask & 0xe & 5615 (val ^ vmcs12->cr0_read_shadow)) 5616 return true; 5617 if ((vmcs12->cr0_guest_host_mask & 0x1) && 5618 !(vmcs12->cr0_read_shadow & 0x1) && 5619 (val & 0x1)) 5620 return true; 5621 break; 5622 } 5623 return false; 5624 } 5625 5626 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 5627 struct vmcs12 *vmcs12, gpa_t bitmap) 5628 { 5629 u32 vmx_instruction_info; 5630 unsigned long field; 5631 u8 b; 5632 5633 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 5634 return true; 5635 5636 /* Decode instruction info and find the field to access */ 5637 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5638 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 5639 5640 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 5641 if (field >> 15) 5642 return true; 5643 5644 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 5645 return true; 5646 5647 return 1 & (b >> (field & 7)); 5648 } 5649 5650 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 5651 { 5652 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 5653 5654 if (nested_cpu_has_mtf(vmcs12)) 5655 return true; 5656 5657 /* 5658 * An MTF VM-exit may be injected into the guest by setting the 5659 * interruption-type to 7 (other event) and the vector field to 0. Such 5660 * is the case regardless of the 'monitor trap flag' VM-execution 5661 * control. 5662 */ 5663 return entry_intr_info == (INTR_INFO_VALID_MASK 5664 | INTR_TYPE_OTHER_EVENT); 5665 } 5666 5667 /* 5668 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 5669 * L1 wants the exit. Only call this when in is_guest_mode (L2). 
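 *
 * Returning true means the exit is consumed by KVM (L0) itself and
 * is never reflected into L1, even if L1's vmcs12 controls asked
 * for it.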
5670 */ 5671 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) 5672 { 5673 u32 intr_info; 5674 5675 switch ((u16)exit_reason) { 5676 case EXIT_REASON_EXCEPTION_NMI: 5677 intr_info = vmx_get_intr_info(vcpu); 5678 if (is_nmi(intr_info)) 5679 return true; 5680 else if (is_page_fault(intr_info)) 5681 return vcpu->arch.apf.host_apf_flags || !enable_ept; 5682 else if (is_debug(intr_info) && 5683 vcpu->guest_debug & 5684 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5685 return true; 5686 else if (is_breakpoint(intr_info) && 5687 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5688 return true; 5689 return false; 5690 case EXIT_REASON_EXTERNAL_INTERRUPT: 5691 return true; 5692 case EXIT_REASON_MCE_DURING_VMENTRY: 5693 return true; 5694 case EXIT_REASON_EPT_VIOLATION: 5695 /* 5696 * L0 always deals with the EPT violation. If nested EPT is 5697 * used, and the nested mmu code discovers that the address is 5698 * missing in the guest EPT table (EPT12), the EPT violation 5699 * will be injected with nested_ept_inject_page_fault() 5700 */ 5701 return true; 5702 case EXIT_REASON_EPT_MISCONFIG: 5703 /* 5704 * L2 never uses L1's EPT directly, but rather L0's own EPT 5705 * table (shadow on EPT) or a merged EPT table that L0 built 5706 * (EPT on EPT). So any problems with the structure of the 5707 * table are L0's fault. 5708 */ 5709 return true; 5710 case EXIT_REASON_PREEMPTION_TIMER: 5711 return true; 5712 case EXIT_REASON_PML_FULL: 5713 /* We emulate PML support to L1. */ 5714 return true; 5715 case EXIT_REASON_VMFUNC: 5716 /* VM functions are emulated through L2->L0 vmexits. */ 5717 return true; 5718 case EXIT_REASON_ENCLS: 5719 /* SGX is never exposed to L1 */ 5720 return true; 5721 default: 5722 break; 5723 } 5724 return false; 5725 } 5726 5727 /* 5728 * Return true if L1 wants to intercept an exit from L2. Only call this when in 5729 * is_guest_mode (L2).
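 *
 * The decision is based purely on the controls L1 programmed into
 * vmcs12 (exception bitmap, execution controls, MSR/IO bitmaps,
 * etc.), mirroring the checks real hardware would perform on L1's
 * behalf.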
5730 */ 5731 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) 5732 { 5733 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5734 u32 intr_info; 5735 5736 switch ((u16)exit_reason) { 5737 case EXIT_REASON_EXCEPTION_NMI: 5738 intr_info = vmx_get_intr_info(vcpu); 5739 if (is_nmi(intr_info)) 5740 return true; 5741 else if (is_page_fault(intr_info)) 5742 return true; 5743 return vmcs12->exception_bitmap & 5744 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 5745 case EXIT_REASON_EXTERNAL_INTERRUPT: 5746 return nested_exit_on_intr(vcpu); 5747 case EXIT_REASON_TRIPLE_FAULT: 5748 return true; 5749 case EXIT_REASON_INTERRUPT_WINDOW: 5750 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 5751 case EXIT_REASON_NMI_WINDOW: 5752 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 5753 case EXIT_REASON_TASK_SWITCH: 5754 return true; 5755 case EXIT_REASON_CPUID: 5756 return true; 5757 case EXIT_REASON_HLT: 5758 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 5759 case EXIT_REASON_INVD: 5760 return true; 5761 case EXIT_REASON_INVLPG: 5762 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5763 case EXIT_REASON_RDPMC: 5764 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 5765 case EXIT_REASON_RDRAND: 5766 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 5767 case EXIT_REASON_RDSEED: 5768 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 5769 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 5770 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 5771 case EXIT_REASON_VMREAD: 5772 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5773 vmcs12->vmread_bitmap); 5774 case EXIT_REASON_VMWRITE: 5775 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5776 vmcs12->vmwrite_bitmap); 5777 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 5778 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 5779 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 5780 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 5781 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 5782 /* 5783 * VMX instructions trap unconditionally. This allows L1 to 5784 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
5785 */ 5786 return true; 5787 case EXIT_REASON_CR_ACCESS: 5788 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 5789 case EXIT_REASON_DR_ACCESS: 5790 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 5791 case EXIT_REASON_IO_INSTRUCTION: 5792 return nested_vmx_exit_handled_io(vcpu, vmcs12); 5793 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 5794 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 5795 case EXIT_REASON_MSR_READ: 5796 case EXIT_REASON_MSR_WRITE: 5797 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 5798 case EXIT_REASON_INVALID_STATE: 5799 return true; 5800 case EXIT_REASON_MWAIT_INSTRUCTION: 5801 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 5802 case EXIT_REASON_MONITOR_TRAP_FLAG: 5803 return nested_vmx_exit_handled_mtf(vmcs12); 5804 case EXIT_REASON_MONITOR_INSTRUCTION: 5805 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 5806 case EXIT_REASON_PAUSE_INSTRUCTION: 5807 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 5808 nested_cpu_has2(vmcs12, 5809 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 5810 case EXIT_REASON_MCE_DURING_VMENTRY: 5811 return true; 5812 case EXIT_REASON_TPR_BELOW_THRESHOLD: 5813 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 5814 case EXIT_REASON_APIC_ACCESS: 5815 case EXIT_REASON_APIC_WRITE: 5816 case EXIT_REASON_EOI_INDUCED: 5817 /* 5818 * The controls for "virtualize APIC accesses," "APIC- 5819 * register virtualization," and "virtual-interrupt 5820 * delivery" only come from vmcs12. 5821 */ 5822 return true; 5823 case EXIT_REASON_INVPCID: 5824 return 5825 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 5826 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5827 case EXIT_REASON_WBINVD: 5828 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 5829 case EXIT_REASON_XSETBV: 5830 return true; 5831 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 5832 /* 5833 * This should never happen, since it is not possible to 5834 * set XSS to a non-zero value---neither in L1 nor in L2. 5835 * If it were, XSS would have to be checked against 5836 * the XSS exit bitmap in vmcs12. 5837 */ 5838 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 5839 case EXIT_REASON_UMWAIT: 5840 case EXIT_REASON_TPAUSE: 5841 return nested_cpu_has2(vmcs12, 5842 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 5843 default: 5844 return true; 5845 } 5846 } 5847 5848 /* 5849 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 5850 * reflected into L1. 5851 */ 5852 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 5853 { 5854 struct vcpu_vmx *vmx = to_vmx(vcpu); 5855 u32 exit_reason = vmx->exit_reason; 5856 unsigned long exit_qual; 5857 u32 exit_intr_info; 5858 5859 WARN_ON_ONCE(vmx->nested.nested_run_pending); 5860 5861 /* 5862 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 5863 * has already loaded L2's state. 5864 */ 5865 if (unlikely(vmx->fail)) { 5866 trace_kvm_nested_vmenter_failed( 5867 "hardware VM-instruction error: ", 5868 vmcs_read32(VM_INSTRUCTION_ERROR)); 5869 exit_intr_info = 0; 5870 exit_qual = 0; 5871 goto reflect_vmexit; 5872 } 5873 5874 exit_intr_info = vmx_get_intr_info(vcpu); 5875 exit_qual = vmx_get_exit_qual(vcpu); 5876 5877 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual, 5878 vmx->idt_vectoring_info, exit_intr_info, 5879 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 5880 KVM_ISA_VMX); 5881 5882 /* If L0 (KVM) wants the exit, it trumps L1's desires.
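 * Checking L0 first keeps the ordering simple: anything KVM must
 * handle itself (EPT violations, external interrupts, etc.) is
 * filtered out before L1's vmcs12 controls are even consulted.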
*/ 5883 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 5884 return false; 5885 5886 /* If L1 doesn't want the exit, handle it in L0. */ 5887 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 5888 return false; 5889 5890 /* 5891 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 5892 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 5893 * need to be synthesized by querying the in-kernel LAPIC, but external 5894 * interrupts are never reflected to L1 so it's a non-issue. 5895 */ 5896 if ((exit_intr_info & 5897 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) == 5898 (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) { 5899 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5900 5901 vmcs12->vm_exit_intr_error_code = 5902 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5903 } 5904 5905 reflect_vmexit: 5906 nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info, exit_qual); 5907 return true; 5908 } 5909 5910 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 5911 struct kvm_nested_state __user *user_kvm_nested_state, 5912 u32 user_data_size) 5913 { 5914 struct vcpu_vmx *vmx; 5915 struct vmcs12 *vmcs12; 5916 struct kvm_nested_state kvm_state = { 5917 .flags = 0, 5918 .format = KVM_STATE_NESTED_FORMAT_VMX, 5919 .size = sizeof(kvm_state), 5920 .hdr.vmx.flags = 0, 5921 .hdr.vmx.vmxon_pa = -1ull, 5922 .hdr.vmx.vmcs12_pa = -1ull, 5923 .hdr.vmx.preemption_timer_deadline = 0, 5924 }; 5925 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 5926 &user_kvm_nested_state->data.vmx[0]; 5927 5928 if (!vcpu) 5929 return kvm_state.size + sizeof(*user_vmx_nested_state); 5930 5931 vmx = to_vmx(vcpu); 5932 vmcs12 = get_vmcs12(vcpu); 5933 5934 if (nested_vmx_allowed(vcpu) && 5935 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 5936 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 5937 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 5938 5939 if (vmx_has_valid_vmcs12(vcpu)) { 5940 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 5941 5942 if (vmx->nested.hv_evmcs) 5943 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 5944 5945 if (is_guest_mode(vcpu) && 5946 nested_cpu_has_shadow_vmcs(vmcs12) && 5947 vmcs12->vmcs_link_pointer != -1ull) 5948 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 5949 } 5950 5951 if (vmx->nested.smm.vmxon) 5952 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 5953 5954 if (vmx->nested.smm.guest_mode) 5955 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 5956 5957 if (is_guest_mode(vcpu)) { 5958 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 5959 5960 if (vmx->nested.nested_run_pending) 5961 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 5962 5963 if (vmx->nested.mtf_pending) 5964 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 5965 5966 if (nested_cpu_has_preemption_timer(vmcs12) && 5967 vmx->nested.has_preemption_timer_deadline) { 5968 kvm_state.hdr.vmx.flags |= 5969 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 5970 kvm_state.hdr.vmx.preemption_timer_deadline = 5971 vmx->nested.preemption_timer_deadline; 5972 } 5973 } 5974 } 5975 5976 if (user_data_size < kvm_state.size) 5977 goto out; 5978 5979 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 5980 return -EFAULT; 5981 5982 if (!vmx_has_valid_vmcs12(vcpu)) 5983 goto out; 5984 5985 /* 5986 * When running L2, the authoritative vmcs12 state is in the 5987 * vmcs02. 
When running L1, the authoritative vmcs12 state is 5988 * in the shadow or enlightened vmcs linked to vmcs01, unless 5989 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 5990 * vmcs12 state is in the vmcs12 already. 5991 */ 5992 if (is_guest_mode(vcpu)) { 5993 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 5994 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5995 } else if (!vmx->nested.need_vmcs12_to_shadow_sync) { 5996 if (vmx->nested.hv_evmcs) 5997 copy_enlightened_to_vmcs12(vmx); 5998 else if (enable_shadow_vmcs) 5999 copy_shadow_to_vmcs12(vmx); 6000 } 6001 6002 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6003 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6004 6005 /* 6006 * Copy over the full allocated size of vmcs12 rather than just the size 6007 * of the struct. 6008 */ 6009 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6010 return -EFAULT; 6011 6012 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6013 vmcs12->vmcs_link_pointer != -1ull) { 6014 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6015 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6016 return -EFAULT; 6017 } 6018 out: 6019 return kvm_state.size; 6020 } 6021 6022 /* 6023 * Forcibly leave nested mode in order to be able to reset the VCPU later on. 6024 */ 6025 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6026 { 6027 if (is_guest_mode(vcpu)) { 6028 to_vmx(vcpu)->nested.nested_run_pending = 0; 6029 nested_vmx_vmexit(vcpu, -1, 0, 0); 6030 } 6031 free_nested(vcpu); 6032 } 6033 6034 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6035 struct kvm_nested_state __user *user_kvm_nested_state, 6036 struct kvm_nested_state *kvm_state) 6037 { 6038 struct vcpu_vmx *vmx = to_vmx(vcpu); 6039 struct vmcs12 *vmcs12; 6040 enum vm_entry_failure_code ignored; 6041 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6042 &user_kvm_nested_state->data.vmx[0]; 6043 int ret; 6044 6045 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6046 return -EINVAL; 6047 6048 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) { 6049 if (kvm_state->hdr.vmx.smm.flags) 6050 return -EINVAL; 6051 6052 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) 6053 return -EINVAL; 6054 6055 /* 6056 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6057 * enable the eVMCS capability on the vCPU. However, since then 6058 * the code was changed such that the flag signals that vmcs12 6059 * should be copied into the eVMCS in guest memory. 6060 * 6061 * To preserve backwards compatibility, allow userspace 6062 * to set this flag even when there is no VMXON region. 6063 */ 6064 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6065 return -EINVAL; 6066 } else { 6067 if (!nested_vmx_allowed(vcpu)) 6068 return -EINVAL; 6069 6070 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6071 return -EINVAL; 6072 } 6073 6074 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6075 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6076 return -EINVAL; 6077 6078 if (kvm_state->hdr.vmx.smm.flags & 6079 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6080 return -EINVAL; 6081 6082 /* 6083 * SMM temporarily disables VMX, so we cannot be in guest mode, 6084 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6085 * must be zero. 6086 */ 6087 if (is_smm(vcpu) ?
6088 (kvm_state->flags & 6089 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6090 : kvm_state->hdr.vmx.smm.flags) 6091 return -EINVAL; 6092 6093 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6094 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6095 return -EINVAL; 6096 6097 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6098 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) 6099 return -EINVAL; 6100 6101 vmx_leave_nested(vcpu); 6102 6103 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) 6104 return 0; 6105 6106 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6107 ret = enter_vmx_operation(vcpu); 6108 if (ret) 6109 return ret; 6110 6111 /* Empty 'VMXON' state is permitted */ 6112 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) 6113 return 0; 6114 6115 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { 6116 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6117 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6118 return -EINVAL; 6119 6120 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6121 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6122 /* 6123 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6124 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6125 * restored yet. EVMCS will be mapped from 6126 * nested_get_vmcs12_pages(). 6127 */ 6128 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); 6129 } else { 6130 return -EINVAL; 6131 } 6132 6133 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 6134 vmx->nested.smm.vmxon = true; 6135 vmx->nested.vmxon = false; 6136 6137 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 6138 vmx->nested.smm.guest_mode = true; 6139 } 6140 6141 vmcs12 = get_vmcs12(vcpu); 6142 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 6143 return -EFAULT; 6144 6145 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 6146 return -EINVAL; 6147 6148 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6149 return 0; 6150 6151 vmx->nested.nested_run_pending = 6152 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 6153 6154 vmx->nested.mtf_pending = 6155 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 6156 6157 ret = -EINVAL; 6158 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6159 vmcs12->vmcs_link_pointer != -1ull) { 6160 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6161 6162 if (kvm_state->size < 6163 sizeof(*kvm_state) + 6164 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 6165 goto error_guest_mode; 6166 6167 if (copy_from_user(shadow_vmcs12, 6168 user_vmx_nested_state->shadow_vmcs12, 6169 sizeof(*shadow_vmcs12))) { 6170 ret = -EFAULT; 6171 goto error_guest_mode; 6172 } 6173 6174 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6175 !shadow_vmcs12->hdr.shadow_vmcs) 6176 goto error_guest_mode; 6177 } 6178 6179 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 6180 vmx->nested.has_preemption_timer_deadline = true; 6181 vmx->nested.preemption_timer_deadline = 6182 kvm_state->hdr.vmx.preemption_timer_deadline; 6183 } 6184 6185 if (nested_vmx_check_controls(vcpu, vmcs12) || 6186 nested_vmx_check_host_state(vcpu, vmcs12) || 6187 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6188 goto error_guest_mode; 6189 6190 vmx->nested.dirty_vmcs12 = true; 6191 ret = nested_vmx_enter_non_root_mode(vcpu, false); 6192 if (ret) 6193 goto error_guest_mode; 6194 6195 return 0; 6196 6197 error_guest_mode: 6198 
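	/*
	 * Undo the speculative setup above: a failed restore must not
	 * leave a stale VM-entry pending for the next KVM_RUN.
	 */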
vmx->nested.nested_run_pending = 0; 6199 return ret; 6200 } 6201 6202 void nested_vmx_set_vmcs_shadowing_bitmap(void) 6203 { 6204 if (enable_shadow_vmcs) { 6205 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 6206 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 6207 } 6208 } 6209 6210 /* 6211 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 6212 * returned for the various VMX controls MSRs when nested VMX is enabled. 6213 * The same values should also be used to verify that vmcs12 control fields are 6214 * valid during nested entry from L1 to L2. 6215 * Each of these control msrs has a low and high 32-bit half: A low bit is on 6216 * if the corresponding bit in the (32-bit) control field *must* be on, and a 6217 * bit in the high half is on if the corresponding bit in the control field 6218 * may be on. See also vmx_control_verify(). 6219 */ 6220 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) 6221 { 6222 /* 6223 * Note that as a general rule, the high half of the MSRs (bits in 6224 * the control fields which may be 1) should be initialized by the 6225 * intersection of the underlying hardware's MSR (i.e., features which 6226 * can be supported) and the list of features we want to expose - 6227 * because they are known to be properly supported in our code. 6228 * Also, usually, the low half of the MSRs (bits which must be 1) can 6229 * be set to 0, meaning that L1 may turn off any of these bits. The 6230 * reason is that if one of these bits is necessary, it will appear 6231 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 6232 * fields of vmcs01 and vmcs02, will turn these bits off - and 6233 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 6234 * These rules have exceptions below. 6235 */ 6236 6237 /* pin-based controls */ 6238 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 6239 msrs->pinbased_ctls_low, 6240 msrs->pinbased_ctls_high); 6241 msrs->pinbased_ctls_low |= 6242 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6243 msrs->pinbased_ctls_high &= 6244 PIN_BASED_EXT_INTR_MASK | 6245 PIN_BASED_NMI_EXITING | 6246 PIN_BASED_VIRTUAL_NMIS | 6247 (enable_apicv ? PIN_BASED_POSTED_INTR : 0); 6248 msrs->pinbased_ctls_high |= 6249 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6250 PIN_BASED_VMX_PREEMPTION_TIMER; 6251 6252 /* exit controls */ 6253 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 6254 msrs->exit_ctls_low, 6255 msrs->exit_ctls_high); 6256 msrs->exit_ctls_low = 6257 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 6258 6259 msrs->exit_ctls_high &= 6260 #ifdef CONFIG_X86_64 6261 VM_EXIT_HOST_ADDR_SPACE_SIZE | 6262 #endif 6263 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 6264 msrs->exit_ctls_high |= 6265 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 6266 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 6267 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 6268 6269 /* We support free control of debug control saving. */ 6270 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 6271 6272 /* entry controls */ 6273 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 6274 msrs->entry_ctls_low, 6275 msrs->entry_ctls_high); 6276 msrs->entry_ctls_low = 6277 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 6278 msrs->entry_ctls_high &= 6279 #ifdef CONFIG_X86_64 6280 VM_ENTRY_IA32E_MODE | 6281 #endif 6282 VM_ENTRY_LOAD_IA32_PAT; 6283 msrs->entry_ctls_high |= 6284 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 6285 6286 /* We support free control of debug control loading. 
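 * ("Free control" means the bit is removed from the low/required
 * half below, so L1 may either set or clear
 * VM_ENTRY_LOAD_DEBUG_CONTROLS.)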
*/ 6287 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 6288 6289 /* cpu-based controls */ 6290 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 6291 msrs->procbased_ctls_low, 6292 msrs->procbased_ctls_high); 6293 msrs->procbased_ctls_low = 6294 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6295 msrs->procbased_ctls_high &= 6296 CPU_BASED_INTR_WINDOW_EXITING | 6297 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 6298 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 6299 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 6300 CPU_BASED_CR3_STORE_EXITING | 6301 #ifdef CONFIG_X86_64 6302 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 6303 #endif 6304 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 6305 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 6306 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 6307 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 6308 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 6309 /* 6310 * We can allow some features even when not supported by the 6311 * hardware. For example, L1 can specify an MSR bitmap - and we 6312 * can use it to avoid exits to L1 - even when L0 runs L2 6313 * without MSR bitmaps. 6314 */ 6315 msrs->procbased_ctls_high |= 6316 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6317 CPU_BASED_USE_MSR_BITMAPS; 6318 6319 /* We support free control of CR3 access interception. */ 6320 msrs->procbased_ctls_low &= 6321 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 6322 6323 /* 6324 * secondary cpu-based controls. Do not include those that 6325 * depend on CPUID bits, they are added later by vmx_cpuid_update. 6326 */ 6327 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) 6328 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 6329 msrs->secondary_ctls_low, 6330 msrs->secondary_ctls_high); 6331 6332 msrs->secondary_ctls_low = 0; 6333 msrs->secondary_ctls_high &= 6334 SECONDARY_EXEC_DESC | 6335 SECONDARY_EXEC_RDTSCP | 6336 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 6337 SECONDARY_EXEC_WBINVD_EXITING | 6338 SECONDARY_EXEC_APIC_REGISTER_VIRT | 6339 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 6340 SECONDARY_EXEC_RDRAND_EXITING | 6341 SECONDARY_EXEC_ENABLE_INVPCID | 6342 SECONDARY_EXEC_RDSEED_EXITING | 6343 SECONDARY_EXEC_XSAVES; 6344 6345 /* 6346 * We can emulate "VMCS shadowing," even if the hardware 6347 * doesn't support it. 6348 */ 6349 msrs->secondary_ctls_high |= 6350 SECONDARY_EXEC_SHADOW_VMCS; 6351 6352 if (enable_ept) { 6353 /* nested EPT: emulate EPT also to L1 */ 6354 msrs->secondary_ctls_high |= 6355 SECONDARY_EXEC_ENABLE_EPT; 6356 msrs->ept_caps = 6357 VMX_EPT_PAGE_WALK_4_BIT | 6358 VMX_EPT_PAGE_WALK_5_BIT | 6359 VMX_EPTP_WB_BIT | 6360 VMX_EPT_INVEPT_BIT | 6361 VMX_EPT_EXECUTE_ONLY_BIT; 6362 6363 msrs->ept_caps &= ept_caps; 6364 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 6365 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 6366 VMX_EPT_1GB_PAGE_BIT; 6367 if (enable_ept_ad_bits) { 6368 msrs->secondary_ctls_high |= 6369 SECONDARY_EXEC_ENABLE_PML; 6370 msrs->ept_caps |= VMX_EPT_AD_BIT; 6371 } 6372 } 6373 6374 if (cpu_has_vmx_vmfunc()) { 6375 msrs->secondary_ctls_high |= 6376 SECONDARY_EXEC_ENABLE_VMFUNC; 6377 /* 6378 * Advertise EPTP switching unconditionally 6379 * since we emulate it 6380 */ 6381 if (enable_ept) 6382 msrs->vmfunc_controls = 6383 VMX_VMFUNC_EPTP_SWITCHING; 6384 } 6385 6386 /* 6387 * Old versions of KVM use the single-context version without 6388 * checking for support, so declare that it is supported even 6389 * though it is treated as global context. 
The alternative is 6390 * not failing the single-context invvpid, and it is worse. 6391 */ 6392 if (enable_vpid) { 6393 msrs->secondary_ctls_high |= 6394 SECONDARY_EXEC_ENABLE_VPID; 6395 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 6396 VMX_VPID_EXTENT_SUPPORTED_MASK; 6397 } 6398 6399 if (enable_unrestricted_guest) 6400 msrs->secondary_ctls_high |= 6401 SECONDARY_EXEC_UNRESTRICTED_GUEST; 6402 6403 if (flexpriority_enabled) 6404 msrs->secondary_ctls_high |= 6405 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6406 6407 /* miscellaneous data */ 6408 rdmsr(MSR_IA32_VMX_MISC, 6409 msrs->misc_low, 6410 msrs->misc_high); 6411 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; 6412 msrs->misc_low |= 6413 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 6414 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 6415 VMX_MISC_ACTIVITY_HLT; 6416 msrs->misc_high = 0; 6417 6418 /* 6419 * This MSR reports some information about VMX support. We 6420 * should return information about the VMX we emulate for the 6421 * guest, and the VMCS structure we give it - not about the 6422 * VMX support of the underlying hardware. 6423 */ 6424 msrs->basic = 6425 VMCS12_REVISION | 6426 VMX_BASIC_TRUE_CTLS | 6427 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 6428 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 6429 6430 if (cpu_has_vmx_basic_inout()) 6431 msrs->basic |= VMX_BASIC_INOUT; 6432 6433 /* 6434 * These MSRs specify bits which the guest must keep fixed on 6435 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 6436 * We picked the standard core2 setting. 6437 */ 6438 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 6439 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 6440 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 6441 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 6442 6443 /* These MSRs specify bits which the guest must keep fixed off. */ 6444 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 6445 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 6446 6447 /* highest index: VMX_PREEMPTION_TIMER_VALUE */ 6448 msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; 6449 } 6450 6451 void nested_vmx_hardware_unsetup(void) 6452 { 6453 int i; 6454 6455 if (enable_shadow_vmcs) { 6456 for (i = 0; i < VMX_BITMAP_NR; i++) 6457 free_page((unsigned long)vmx_bitmap[i]); 6458 } 6459 } 6460 6461 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 6462 { 6463 int i; 6464 6465 if (!cpu_has_vmx_shadow_vmcs()) 6466 enable_shadow_vmcs = 0; 6467 if (enable_shadow_vmcs) { 6468 for (i = 0; i < VMX_BITMAP_NR; i++) { 6469 /* 6470 * The vmx_bitmap is not tied to a VM and so should 6471 * not be charged to a memcg. 
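 * (Hence plain GFP_KERNEL below rather than GFP_KERNEL_ACCOUNT,
 * which is the memcg-charged variant.)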
6472 */ 6473 vmx_bitmap[i] = (unsigned long *) 6474 __get_free_page(GFP_KERNEL); 6475 if (!vmx_bitmap[i]) { 6476 nested_vmx_hardware_unsetup(); 6477 return -ENOMEM; 6478 } 6479 } 6480 6481 init_vmcs_shadow_fields(); 6482 } 6483 6484 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 6485 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 6486 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 6487 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 6488 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 6489 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 6490 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 6491 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; 6492 exit_handlers[EXIT_REASON_VMON] = handle_vmon; 6493 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 6494 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 6495 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 6496 6497 return 0; 6498 } 6499 6500 struct kvm_x86_nested_ops vmx_nested_ops = { 6501 .check_events = vmx_check_nested_events, 6502 .hv_timer_pending = nested_vmx_preemption_timer_pending, 6503 .get_state = vmx_get_nested_state, 6504 .set_state = vmx_set_nested_state, 6505 .get_vmcs12_pages = nested_get_vmcs12_pages, 6506 .enable_evmcs = nested_enable_evmcs, 6507 .get_evmcs_version = nested_get_evmcs_version, 6508 }; 6509
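/*
 * Note: common x86 code invokes the callbacks above through the nested_ops
 * hook in kvm_x86_ops; e.g. the KVM_GET_NESTED_STATE and
 * KVM_SET_NESTED_STATE ioctls end up in .get_state/.set_state, and pending
 * nested events are checked via .check_events as part of event injection.
 */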