1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/frame.h> 4 #include <linux/percpu.h> 5 6 #include <asm/debugreg.h> 7 #include <asm/mmu_context.h> 8 9 #include "cpuid.h" 10 #include "hyperv.h" 11 #include "mmu.h" 12 #include "nested.h" 13 #include "pmu.h" 14 #include "trace.h" 15 #include "x86.h" 16 17 static bool __read_mostly enable_shadow_vmcs = 1; 18 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 19 20 static bool __read_mostly nested_early_check = 0; 21 module_param(nested_early_check, bool, S_IRUGO); 22 23 #define CC(consistency_check) \ 24 ({ \ 25 bool failed = (consistency_check); \ 26 if (failed) \ 27 trace_kvm_nested_vmenter_failed(#consistency_check, 0); \ 28 failed; \ 29 }) 30 31 #define SET_MSR_OR_WARN(vcpu, idx, data) \ 32 ({ \ 33 bool failed = kvm_set_msr(vcpu, idx, data); \ 34 if (failed) \ 35 pr_warn_ratelimited( \ 36 "%s cannot write MSR (0x%x, 0x%llx)\n", \ 37 __func__, idx, data); \ 38 failed; \ 39 }) 40 41 /* 42 * Hyper-V requires all of these, so mark them as supported even though 43 * they are just treated the same as all-context. 44 */ 45 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 46 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 47 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 48 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 49 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 50 51 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 52 53 enum { 54 VMX_VMREAD_BITMAP, 55 VMX_VMWRITE_BITMAP, 56 VMX_BITMAP_NR 57 }; 58 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 59 60 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 61 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 62 63 struct shadow_vmcs_field { 64 u16 encoding; 65 u16 offset; 66 }; 67 static struct shadow_vmcs_field shadow_read_only_fields[] = { 68 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 69 #include "vmcs_shadow_fields.h" 70 }; 71 static int max_shadow_read_only_fields = 72 ARRAY_SIZE(shadow_read_only_fields); 73 74 static struct shadow_vmcs_field shadow_read_write_fields[] = { 75 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 76 #include "vmcs_shadow_fields.h" 77 }; 78 static int max_shadow_read_write_fields = 79 ARRAY_SIZE(shadow_read_write_fields); 80 81 static void init_vmcs_shadow_fields(void) 82 { 83 int i, j; 84 85 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 86 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 87 88 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 89 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 90 u16 field = entry.encoding; 91 92 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 93 (i + 1 == max_shadow_read_only_fields || 94 shadow_read_only_fields[i + 1].encoding != field + 1)) 95 pr_err("Missing field from shadow_read_only_field %x\n", 96 field + 1); 97 98 clear_bit(field, vmx_vmread_bitmap); 99 if (field & 1) 100 #ifdef CONFIG_X86_64 101 continue; 102 #else 103 entry.offset += sizeof(u32); 104 #endif 105 shadow_read_only_fields[j++] = entry; 106 } 107 max_shadow_read_only_fields = j; 108 109 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 110 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 111 u16 field = entry.encoding; 112 113 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 114 (i + 1 == max_shadow_read_write_fields || 115 shadow_read_write_fields[i + 1].encoding != field + 1)) 116 pr_err("Missing field from shadow_read_write_field %x\n", 117 field + 1); 118 119 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 120 field <= GUEST_TR_AR_BYTES, 121 "Update vmcs12_write_any() 
to drop reserved bits from AR_BYTES"); 122 123 /* 124 * PML and the preemption timer can be emulated, but the 125 * processor cannot vmwrite to fields that don't exist 126 * on bare metal. 127 */ 128 switch (field) { 129 case GUEST_PML_INDEX: 130 if (!cpu_has_vmx_pml()) 131 continue; 132 break; 133 case VMX_PREEMPTION_TIMER_VALUE: 134 if (!cpu_has_vmx_preemption_timer()) 135 continue; 136 break; 137 case GUEST_INTR_STATUS: 138 if (!cpu_has_vmx_apicv()) 139 continue; 140 break; 141 default: 142 break; 143 } 144 145 clear_bit(field, vmx_vmwrite_bitmap); 146 clear_bit(field, vmx_vmread_bitmap); 147 if (field & 1) 148 #ifdef CONFIG_X86_64 149 continue; 150 #else 151 entry.offset += sizeof(u32); 152 #endif 153 shadow_read_write_fields[j++] = entry; 154 } 155 max_shadow_read_write_fields = j; 156 } 157 158 /* 159 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 160 * set the success or error code of an emulated VMX instruction (as specified 161 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated 162 * instruction. 163 */ 164 static int nested_vmx_succeed(struct kvm_vcpu *vcpu) 165 { 166 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 167 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 168 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 169 return kvm_skip_emulated_instruction(vcpu); 170 } 171 172 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 173 { 174 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 175 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 176 X86_EFLAGS_SF | X86_EFLAGS_OF)) 177 | X86_EFLAGS_CF); 178 return kvm_skip_emulated_instruction(vcpu); 179 } 180 181 static int nested_vmx_failValid(struct kvm_vcpu *vcpu, 182 u32 vm_instruction_error) 183 { 184 struct vcpu_vmx *vmx = to_vmx(vcpu); 185 186 /* 187 * failValid writes the error number to the current VMCS, which 188 * can't be done if there isn't a current VMCS. 189 */ 190 if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) 191 return nested_vmx_failInvalid(vcpu); 192 193 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 194 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 195 X86_EFLAGS_SF | X86_EFLAGS_OF)) 196 | X86_EFLAGS_ZF); 197 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 198 /* 199 * We don't need to force a shadow sync because 200 * VM_INSTRUCTION_ERROR is not shadowed 201 */ 202 return kvm_skip_emulated_instruction(vcpu); 203 } 204 205 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 206 { 207 /* TODO: not to reset guest simply here. 
*/ 208 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 209 pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); 210 } 211 212 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 213 { 214 return fixed_bits_valid(control, low, high); 215 } 216 217 static inline u64 vmx_control_msr(u32 low, u32 high) 218 { 219 return low | ((u64)high << 32); 220 } 221 222 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 223 { 224 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 225 vmcs_write64(VMCS_LINK_POINTER, -1ull); 226 vmx->nested.need_vmcs12_to_shadow_sync = false; 227 } 228 229 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) 230 { 231 struct vcpu_vmx *vmx = to_vmx(vcpu); 232 233 if (!vmx->nested.hv_evmcs) 234 return; 235 236 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); 237 vmx->nested.hv_evmcs_vmptr = -1ull; 238 vmx->nested.hv_evmcs = NULL; 239 } 240 241 /* 242 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 243 * just stops using VMX. 244 */ 245 static void free_nested(struct kvm_vcpu *vcpu) 246 { 247 struct vcpu_vmx *vmx = to_vmx(vcpu); 248 249 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 250 return; 251 252 kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); 253 254 vmx->nested.vmxon = false; 255 vmx->nested.smm.vmxon = false; 256 free_vpid(vmx->nested.vpid02); 257 vmx->nested.posted_intr_nv = -1; 258 vmx->nested.current_vmptr = -1ull; 259 if (enable_shadow_vmcs) { 260 vmx_disable_shadow_vmcs(vmx); 261 vmcs_clear(vmx->vmcs01.shadow_vmcs); 262 free_vmcs(vmx->vmcs01.shadow_vmcs); 263 vmx->vmcs01.shadow_vmcs = NULL; 264 } 265 kfree(vmx->nested.cached_vmcs12); 266 vmx->nested.cached_vmcs12 = NULL; 267 kfree(vmx->nested.cached_shadow_vmcs12); 268 vmx->nested.cached_shadow_vmcs12 = NULL; 269 /* Unpin physical memory we referred to in the vmcs02 */ 270 if (vmx->nested.apic_access_page) { 271 kvm_release_page_clean(vmx->nested.apic_access_page); 272 vmx->nested.apic_access_page = NULL; 273 } 274 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 275 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 276 vmx->nested.pi_desc = NULL; 277 278 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 279 280 nested_release_evmcs(vcpu); 281 282 free_loaded_vmcs(&vmx->nested.vmcs02); 283 } 284 285 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 286 struct loaded_vmcs *prev) 287 { 288 struct vmcs_host_state *dest, *src; 289 290 if (unlikely(!vmx->guest_state_loaded)) 291 return; 292 293 src = &prev->host_state; 294 dest = &vmx->loaded_vmcs->host_state; 295 296 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 297 dest->ldt_sel = src->ldt_sel; 298 #ifdef CONFIG_X86_64 299 dest->ds_sel = src->ds_sel; 300 dest->es_sel = src->es_sel; 301 #endif 302 } 303 304 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 305 { 306 struct vcpu_vmx *vmx = to_vmx(vcpu); 307 struct loaded_vmcs *prev; 308 int cpu; 309 310 if (vmx->loaded_vmcs == vmcs) 311 return; 312 313 cpu = get_cpu(); 314 prev = vmx->loaded_vmcs; 315 vmx->loaded_vmcs = vmcs; 316 vmx_vcpu_load_vmcs(vcpu, cpu); 317 vmx_sync_vmcs_host_state(vmx, prev); 318 put_cpu(); 319 320 vmx_segment_cache_clear(vmx); 321 } 322 323 /* 324 * Ensure that the current vmcs of the logical processor is the 325 * vmcs01 of the vcpu before calling free_nested(). 
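 * free_nested() frees the vmcs02 and the shadow VMCS; neither may still be
 * the processor's current VMCS when that happens.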
326 */ 327 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 328 { 329 vcpu_load(vcpu); 330 vmx_leave_nested(vcpu); 331 vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01); 332 free_nested(vcpu); 333 vcpu_put(vcpu); 334 } 335 336 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 337 struct x86_exception *fault) 338 { 339 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 340 struct vcpu_vmx *vmx = to_vmx(vcpu); 341 u32 exit_reason; 342 unsigned long exit_qualification = vcpu->arch.exit_qualification; 343 344 if (vmx->nested.pml_full) { 345 exit_reason = EXIT_REASON_PML_FULL; 346 vmx->nested.pml_full = false; 347 exit_qualification &= INTR_INFO_UNBLOCK_NMI; 348 } else if (fault->error_code & PFERR_RSVD_MASK) 349 exit_reason = EXIT_REASON_EPT_MISCONFIG; 350 else 351 exit_reason = EXIT_REASON_EPT_VIOLATION; 352 353 nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification); 354 vmcs12->guest_physical_address = fault->address; 355 } 356 357 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 358 { 359 WARN_ON(mmu_is_nested(vcpu)); 360 361 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 362 kvm_init_shadow_ept_mmu(vcpu, 363 to_vmx(vcpu)->nested.msrs.ept_caps & 364 VMX_EPT_EXECUTE_ONLY_BIT, 365 nested_ept_ad_enabled(vcpu), 366 nested_ept_get_cr3(vcpu)); 367 vcpu->arch.mmu->set_cr3 = vmx_set_cr3; 368 vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; 369 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 370 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 371 372 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 373 } 374 375 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 376 { 377 vcpu->arch.mmu = &vcpu->arch.root_mmu; 378 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 379 } 380 381 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 382 u16 error_code) 383 { 384 bool inequality, bit; 385 386 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 387 inequality = 388 (error_code & vmcs12->page_fault_error_code_mask) != 389 vmcs12->page_fault_error_code_match; 390 return inequality ^ bit; 391 } 392 393 394 /* 395 * KVM wants to inject page-faults which it got to the guest. This function 396 * checks whether in a nested guest, we need to inject them to L1 or L2. 397 */ 398 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual) 399 { 400 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 401 unsigned int nr = vcpu->arch.exception.nr; 402 bool has_payload = vcpu->arch.exception.has_payload; 403 unsigned long payload = vcpu->arch.exception.payload; 404 405 if (nr == PF_VECTOR) { 406 if (vcpu->arch.exception.nested_apf) { 407 *exit_qual = vcpu->arch.apf.nested_apf_token; 408 return 1; 409 } 410 if (nested_vmx_is_page_fault_vmexit(vmcs12, 411 vcpu->arch.exception.error_code)) { 412 *exit_qual = has_payload ? 
payload : vcpu->arch.cr2; 413 return 1; 414 } 415 } else if (vmcs12->exception_bitmap & (1u << nr)) { 416 if (nr == DB_VECTOR) { 417 if (!has_payload) { 418 payload = vcpu->arch.dr6; 419 payload &= ~(DR6_FIXED_1 | DR6_BT); 420 payload ^= DR6_RTM; 421 } 422 *exit_qual = payload; 423 } else 424 *exit_qual = 0; 425 return 1; 426 } 427 428 return 0; 429 } 430 431 432 static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, 433 struct x86_exception *fault) 434 { 435 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 436 437 WARN_ON(!is_guest_mode(vcpu)); 438 439 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) && 440 !to_vmx(vcpu)->nested.nested_run_pending) { 441 vmcs12->vm_exit_intr_error_code = fault->error_code; 442 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 443 PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | 444 INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, 445 fault->address); 446 } else { 447 kvm_inject_page_fault(vcpu, fault); 448 } 449 } 450 451 static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) 452 { 453 return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); 454 } 455 456 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 457 struct vmcs12 *vmcs12) 458 { 459 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 460 return 0; 461 462 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 463 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 464 return -EINVAL; 465 466 return 0; 467 } 468 469 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 470 struct vmcs12 *vmcs12) 471 { 472 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 473 return 0; 474 475 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 476 return -EINVAL; 477 478 return 0; 479 } 480 481 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 482 struct vmcs12 *vmcs12) 483 { 484 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 485 return 0; 486 487 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 488 return -EINVAL; 489 490 return 0; 491 } 492 493 /* 494 * Check if MSR is intercepted for L01 MSR bitmap. 495 */ 496 static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) 497 { 498 unsigned long *msr_bitmap; 499 int f = sizeof(unsigned long); 500 501 if (!cpu_has_vmx_msr_bitmap()) 502 return true; 503 504 msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; 505 506 if (msr <= 0x1fff) { 507 return !!test_bit(msr, msr_bitmap + 0x800 / f); 508 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 509 msr &= 0x1fff; 510 return !!test_bit(msr, msr_bitmap + 0xc00 / f); 511 } 512 513 return true; 514 } 515 516 /* 517 * If a msr is allowed by L0, we should check whether it is allowed by L1. 518 * The corresponding bit will be cleared unless both of L0 and L1 allow it. 519 */ 520 static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, 521 unsigned long *msr_bitmap_nested, 522 u32 msr, int type) 523 { 524 int f = sizeof(unsigned long); 525 526 /* 527 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 528 * have the write-low and read-high bitmap offsets the wrong way round. 529 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 
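	 * Each 1-KByte quarter of the 4-KByte bitmap covers one case: read-low
	 * at offset 0x000, read-high at 0x400, write-low at 0x800 and
	 * write-high at 0xc00, one bit per MSR. For example, a write to MSR
	 * 0xc0000080 (EFER) is governed by bit 0x80 of the write-high quarter.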
530 */ 531 if (msr <= 0x1fff) { 532 if (type & MSR_TYPE_R && 533 !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) 534 /* read-low */ 535 __clear_bit(msr, msr_bitmap_nested + 0x000 / f); 536 537 if (type & MSR_TYPE_W && 538 !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) 539 /* write-low */ 540 __clear_bit(msr, msr_bitmap_nested + 0x800 / f); 541 542 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 543 msr &= 0x1fff; 544 if (type & MSR_TYPE_R && 545 !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) 546 /* read-high */ 547 __clear_bit(msr, msr_bitmap_nested + 0x400 / f); 548 549 if (type & MSR_TYPE_W && 550 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) 551 /* write-high */ 552 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); 553 554 } 555 } 556 557 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) { 558 int msr; 559 560 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 561 unsigned word = msr / BITS_PER_LONG; 562 563 msr_bitmap[word] = ~0; 564 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 565 } 566 } 567 568 /* 569 * Merge L0's and L1's MSR bitmap, return false to indicate that 570 * we do not use the hardware. 571 */ 572 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 573 struct vmcs12 *vmcs12) 574 { 575 int msr; 576 unsigned long *msr_bitmap_l1; 577 unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; 578 struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map; 579 580 /* Nothing to do if the MSR bitmap is not in use. */ 581 if (!cpu_has_vmx_msr_bitmap() || 582 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 583 return false; 584 585 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) 586 return false; 587 588 msr_bitmap_l1 = (unsigned long *)map->hva; 589 590 /* 591 * To keep the control flow simple, pay eight 8-byte writes (sixteen 592 * 4-byte writes on 32-bit systems) up front to enable intercepts for 593 * the x2APIC MSR range and selectively disable them below. 594 */ 595 enable_x2apic_msr_intercepts(msr_bitmap_l0); 596 597 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 598 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 599 /* 600 * L0 need not intercept reads for MSRs between 0x800 601 * and 0x8ff, it just lets the processor take the value 602 * from the virtual-APIC page; take those 256 bits 603 * directly from the L1 bitmap. 604 */ 605 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 606 unsigned word = msr / BITS_PER_LONG; 607 608 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 609 } 610 } 611 612 nested_vmx_disable_intercept_for_msr( 613 msr_bitmap_l1, msr_bitmap_l0, 614 X2APIC_MSR(APIC_TASKPRI), 615 MSR_TYPE_R | MSR_TYPE_W); 616 617 if (nested_cpu_has_vid(vmcs12)) { 618 nested_vmx_disable_intercept_for_msr( 619 msr_bitmap_l1, msr_bitmap_l0, 620 X2APIC_MSR(APIC_EOI), 621 MSR_TYPE_W); 622 nested_vmx_disable_intercept_for_msr( 623 msr_bitmap_l1, msr_bitmap_l0, 624 X2APIC_MSR(APIC_SELF_IPI), 625 MSR_TYPE_W); 626 } 627 } 628 629 /* KVM unconditionally exposes the FS/GS base MSRs to L1. */ 630 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, 631 MSR_FS_BASE, MSR_TYPE_RW); 632 633 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, 634 MSR_GS_BASE, MSR_TYPE_RW); 635 636 nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, 637 MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 638 639 /* 640 * Checking the L0->L1 bitmap is trying to verify two things: 641 * 642 * 1. L0 gave a permission to L1 to actually passthrough the MSR. 
This ensures that we do not accidentally generate an L02 MSR bitmap
	 *    from the L12 MSR bitmap that is too permissive.
	 * 2. That L1 or L2s have actually used the MSR. This avoids
	 *    unnecessary merging of the bitmap if the MSR is unused. This
	 *    works properly because we only update the L01 MSR bitmap lazily.
	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
	 *    updated to reflect this when L1 (or its L2s) actually write to
	 *    the MSR.
	 */
	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_SPEC_CTRL,
					MSR_TYPE_R | MSR_TYPE_W);

	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_PRED_CMD,
					MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct kvm_host_map map;
	struct vmcs12 *shadow;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	shadow = get_shadow_vmcs12(vcpu);

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
		return;

	memcpy(shadow, map.hva, VMCS12_SIZE);
	kvm_vcpu_unmap(vcpu, &map, false);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT.
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
	return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has already been checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
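	 * i.e. the 64-byte posted-interrupt descriptor must be 64-byte
	 * aligned.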
756 */ 757 if (nested_cpu_has_posted_intr(vmcs12) && 758 (CC(!nested_cpu_has_vid(vmcs12)) || 759 CC(!nested_exit_intr_ack_set(vcpu)) || 760 CC((vmcs12->posted_intr_nv & 0xff00)) || 761 CC((vmcs12->posted_intr_desc_addr & 0x3f)) || 762 CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))) 763 return -EINVAL; 764 765 /* tpr shadow is needed by all apicv features. */ 766 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 767 return -EINVAL; 768 769 return 0; 770 } 771 772 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 773 u32 count, u64 addr) 774 { 775 int maxphyaddr; 776 777 if (count == 0) 778 return 0; 779 maxphyaddr = cpuid_maxphyaddr(vcpu); 780 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || 781 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) 782 return -EINVAL; 783 784 return 0; 785 } 786 787 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 788 struct vmcs12 *vmcs12) 789 { 790 if (CC(nested_vmx_check_msr_switch(vcpu, 791 vmcs12->vm_exit_msr_load_count, 792 vmcs12->vm_exit_msr_load_addr)) || 793 CC(nested_vmx_check_msr_switch(vcpu, 794 vmcs12->vm_exit_msr_store_count, 795 vmcs12->vm_exit_msr_store_addr))) 796 return -EINVAL; 797 798 return 0; 799 } 800 801 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 802 struct vmcs12 *vmcs12) 803 { 804 if (CC(nested_vmx_check_msr_switch(vcpu, 805 vmcs12->vm_entry_msr_load_count, 806 vmcs12->vm_entry_msr_load_addr))) 807 return -EINVAL; 808 809 return 0; 810 } 811 812 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 813 struct vmcs12 *vmcs12) 814 { 815 if (!nested_cpu_has_pml(vmcs12)) 816 return 0; 817 818 if (CC(!nested_cpu_has_ept(vmcs12)) || 819 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 820 return -EINVAL; 821 822 return 0; 823 } 824 825 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 826 struct vmcs12 *vmcs12) 827 { 828 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 829 !nested_cpu_has_ept(vmcs12))) 830 return -EINVAL; 831 return 0; 832 } 833 834 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 835 struct vmcs12 *vmcs12) 836 { 837 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 838 !nested_cpu_has_ept(vmcs12))) 839 return -EINVAL; 840 return 0; 841 } 842 843 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 844 struct vmcs12 *vmcs12) 845 { 846 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 847 return 0; 848 849 if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 850 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 851 return -EINVAL; 852 853 return 0; 854 } 855 856 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 857 struct vmx_msr_entry *e) 858 { 859 /* x2APIC MSR accesses are not allowed */ 860 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 861 return -EINVAL; 862 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 863 CC(e->index == MSR_IA32_UCODE_REV)) 864 return -EINVAL; 865 if (CC(e->reserved != 0)) 866 return -EINVAL; 867 return 0; 868 } 869 870 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 871 struct vmx_msr_entry *e) 872 { 873 if (CC(e->index == MSR_FS_BASE) || 874 CC(e->index == MSR_GS_BASE) || 875 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 876 nested_vmx_msr_check_common(vcpu, e)) 877 return -EINVAL; 878 return 0; 879 } 880 881 static int nested_vmx_store_msr_check(struct kvm_vcpu 
*vcpu, 882 struct vmx_msr_entry *e) 883 { 884 if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ 885 nested_vmx_msr_check_common(vcpu, e)) 886 return -EINVAL; 887 return 0; 888 } 889 890 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) 891 { 892 struct vcpu_vmx *vmx = to_vmx(vcpu); 893 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 894 vmx->nested.msrs.misc_high); 895 896 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; 897 } 898 899 /* 900 * Load guest's/host's msr at nested entry/exit. 901 * return 0 for success, entry index for failure. 902 * 903 * One of the failure modes for MSR load/store is when a list exceeds the 904 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch 905 * as possible, process all valid entries before failing rather than precheck 906 * for a capacity violation. 907 */ 908 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 909 { 910 u32 i; 911 struct vmx_msr_entry e; 912 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 913 914 for (i = 0; i < count; i++) { 915 if (unlikely(i >= max_msr_list_size)) 916 goto fail; 917 918 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 919 &e, sizeof(e))) { 920 pr_debug_ratelimited( 921 "%s cannot read MSR entry (%u, 0x%08llx)\n", 922 __func__, i, gpa + i * sizeof(e)); 923 goto fail; 924 } 925 if (nested_vmx_load_msr_check(vcpu, &e)) { 926 pr_debug_ratelimited( 927 "%s check failed (%u, 0x%x, 0x%x)\n", 928 __func__, i, e.index, e.reserved); 929 goto fail; 930 } 931 if (kvm_set_msr(vcpu, e.index, e.value)) { 932 pr_debug_ratelimited( 933 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 934 __func__, i, e.index, e.value); 935 goto fail; 936 } 937 } 938 return 0; 939 fail: 940 return i + 1; 941 } 942 943 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, 944 u32 msr_index, 945 u64 *data) 946 { 947 struct vcpu_vmx *vmx = to_vmx(vcpu); 948 949 /* 950 * If the L0 hypervisor stored a more accurate value for the TSC that 951 * does not include the time taken for emulation of the L2->L1 952 * VM-exit in L0, use the more accurate value. 
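	 * Such a value is available when MSR_IA32_TSC is in the vmcs02
	 * VM-exit MSR-store area; see prepare_vmx_msr_autostore_list().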
953 */ 954 if (msr_index == MSR_IA32_TSC) { 955 int index = vmx_find_msr_index(&vmx->msr_autostore.guest, 956 MSR_IA32_TSC); 957 958 if (index >= 0) { 959 u64 val = vmx->msr_autostore.guest.val[index].value; 960 961 *data = kvm_read_l1_tsc(vcpu, val); 962 return true; 963 } 964 } 965 966 if (kvm_get_msr(vcpu, msr_index, data)) { 967 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 968 msr_index); 969 return false; 970 } 971 return true; 972 } 973 974 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 975 struct vmx_msr_entry *e) 976 { 977 if (kvm_vcpu_read_guest(vcpu, 978 gpa + i * sizeof(*e), 979 e, 2 * sizeof(u32))) { 980 pr_debug_ratelimited( 981 "%s cannot read MSR entry (%u, 0x%08llx)\n", 982 __func__, i, gpa + i * sizeof(*e)); 983 return false; 984 } 985 if (nested_vmx_store_msr_check(vcpu, e)) { 986 pr_debug_ratelimited( 987 "%s check failed (%u, 0x%x, 0x%x)\n", 988 __func__, i, e->index, e->reserved); 989 return false; 990 } 991 return true; 992 } 993 994 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 995 { 996 u64 data; 997 u32 i; 998 struct vmx_msr_entry e; 999 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1000 1001 for (i = 0; i < count; i++) { 1002 if (unlikely(i >= max_msr_list_size)) 1003 return -EINVAL; 1004 1005 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1006 return -EINVAL; 1007 1008 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 1009 return -EINVAL; 1010 1011 if (kvm_vcpu_write_guest(vcpu, 1012 gpa + i * sizeof(e) + 1013 offsetof(struct vmx_msr_entry, value), 1014 &data, sizeof(data))) { 1015 pr_debug_ratelimited( 1016 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1017 __func__, i, e.index, data); 1018 return -EINVAL; 1019 } 1020 } 1021 return 0; 1022 } 1023 1024 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1025 { 1026 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1027 u32 count = vmcs12->vm_exit_msr_store_count; 1028 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1029 struct vmx_msr_entry e; 1030 u32 i; 1031 1032 for (i = 0; i < count; i++) { 1033 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1034 return false; 1035 1036 if (e.index == msr_index) 1037 return true; 1038 } 1039 return false; 1040 } 1041 1042 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, 1043 u32 msr_index) 1044 { 1045 struct vcpu_vmx *vmx = to_vmx(vcpu); 1046 struct vmx_msrs *autostore = &vmx->msr_autostore.guest; 1047 bool in_vmcs12_store_list; 1048 int msr_autostore_index; 1049 bool in_autostore_list; 1050 int last; 1051 1052 msr_autostore_index = vmx_find_msr_index(autostore, msr_index); 1053 in_autostore_list = msr_autostore_index >= 0; 1054 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); 1055 1056 if (in_vmcs12_store_list && !in_autostore_list) { 1057 if (autostore->nr == NR_LOADSTORE_MSRS) { 1058 /* 1059 * Emulated VMEntry does not fail here. Instead a less 1060 * accurate value will be returned by 1061 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() 1062 * instead of reading the value from the vmcs02 VMExit 1063 * MSR-store area. 1064 */ 1065 pr_warn_ratelimited( 1066 "Not enough msr entries in msr_autostore. 
Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_index] = autostore->val[last];
	}
}

static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
	unsigned long invalid_mask;

	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
	return (val & invalid_mask) == 0;
}

/*
 * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
 * emulating VM entry into a guest with EPT enabled.
 * Returns 0 on success, -EINVAL on failure. The invalid-state exit
 * qualification code is assigned to entry_failure_code on failure.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
			       u32 *entry_failure_code)
{
	if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
		if (CC(!nested_cr3_valid(vcpu, cr3))) {
			*entry_failure_code = ENTRY_FAIL_DEFAULT;
			return -EINVAL;
		}

		/*
		 * If PAE paging and EPT are both on, CR3 is not used by the
		 * CPU and must not be dereferenced.
		 */
		if (is_pae_paging(vcpu) && !nested_ept) {
			if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
				*entry_failure_code = ENTRY_FAIL_PDPTE;
				return -EINVAL;
			}
		}
	}

	if (!nested_ept)
		kvm_mmu_new_cr3(vcpu, cr3, false);

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	kvm_init_mmu(vcpu, false);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPIDs (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmx->nested.msrs.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
1175 */ 1176 if (data & BIT_ULL(48)) 1177 return -EINVAL; 1178 1179 if (vmx_basic_vmcs_revision_id(vmx_basic) != 1180 vmx_basic_vmcs_revision_id(data)) 1181 return -EINVAL; 1182 1183 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 1184 return -EINVAL; 1185 1186 vmx->nested.msrs.basic = data; 1187 return 0; 1188 } 1189 1190 static int 1191 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1192 { 1193 u64 supported; 1194 u32 *lowp, *highp; 1195 1196 switch (msr_index) { 1197 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1198 lowp = &vmx->nested.msrs.pinbased_ctls_low; 1199 highp = &vmx->nested.msrs.pinbased_ctls_high; 1200 break; 1201 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1202 lowp = &vmx->nested.msrs.procbased_ctls_low; 1203 highp = &vmx->nested.msrs.procbased_ctls_high; 1204 break; 1205 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1206 lowp = &vmx->nested.msrs.exit_ctls_low; 1207 highp = &vmx->nested.msrs.exit_ctls_high; 1208 break; 1209 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1210 lowp = &vmx->nested.msrs.entry_ctls_low; 1211 highp = &vmx->nested.msrs.entry_ctls_high; 1212 break; 1213 case MSR_IA32_VMX_PROCBASED_CTLS2: 1214 lowp = &vmx->nested.msrs.secondary_ctls_low; 1215 highp = &vmx->nested.msrs.secondary_ctls_high; 1216 break; 1217 default: 1218 BUG(); 1219 } 1220 1221 supported = vmx_control_msr(*lowp, *highp); 1222 1223 /* Check must-be-1 bits are still 1. */ 1224 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) 1225 return -EINVAL; 1226 1227 /* Check must-be-0 bits are still 0. */ 1228 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) 1229 return -EINVAL; 1230 1231 *lowp = data; 1232 *highp = data >> 32; 1233 return 0; 1234 } 1235 1236 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1237 { 1238 const u64 feature_and_reserved_bits = 1239 /* feature */ 1240 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | 1241 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | 1242 /* reserved */ 1243 GENMASK_ULL(13, 9) | BIT_ULL(31); 1244 u64 vmx_misc; 1245 1246 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 1247 vmx->nested.msrs.misc_high); 1248 1249 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) 1250 return -EINVAL; 1251 1252 if ((vmx->nested.msrs.pinbased_ctls_high & 1253 PIN_BASED_VMX_PREEMPTION_TIMER) && 1254 vmx_misc_preemption_timer_rate(data) != 1255 vmx_misc_preemption_timer_rate(vmx_misc)) 1256 return -EINVAL; 1257 1258 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1259 return -EINVAL; 1260 1261 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1262 return -EINVAL; 1263 1264 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1265 return -EINVAL; 1266 1267 vmx->nested.msrs.misc_low = data; 1268 vmx->nested.msrs.misc_high = data >> 32; 1269 1270 return 0; 1271 } 1272 1273 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1274 { 1275 u64 vmx_ept_vpid_cap; 1276 1277 vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps, 1278 vmx->nested.msrs.vpid_caps); 1279 1280 /* Every bit is either reserved or a feature bit. 
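	 * the whole 64-bit value must therefore be a subset of the
	 * capabilities KVM already reports.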
	 */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 *msr;

	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		msr = &vmx->nested.msrs.cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		msr = &vmx->nested.msrs.cr4_fixed0;
		break;
	default:
		BUG();
	}

	/*
	 * Bits that are set (i.e. bits that "must be 1" during VMX operation)
	 * must also be set in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*msr = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol. 3, A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmx->nested.msrs.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise.
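 * (1 when msr_index does not refer to a VMX capability MSR handled here).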
*/ 1385 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1386 { 1387 switch (msr_index) { 1388 case MSR_IA32_VMX_BASIC: 1389 *pdata = msrs->basic; 1390 break; 1391 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1392 case MSR_IA32_VMX_PINBASED_CTLS: 1393 *pdata = vmx_control_msr( 1394 msrs->pinbased_ctls_low, 1395 msrs->pinbased_ctls_high); 1396 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1397 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1398 break; 1399 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1400 case MSR_IA32_VMX_PROCBASED_CTLS: 1401 *pdata = vmx_control_msr( 1402 msrs->procbased_ctls_low, 1403 msrs->procbased_ctls_high); 1404 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1405 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1406 break; 1407 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1408 case MSR_IA32_VMX_EXIT_CTLS: 1409 *pdata = vmx_control_msr( 1410 msrs->exit_ctls_low, 1411 msrs->exit_ctls_high); 1412 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1413 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1414 break; 1415 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1416 case MSR_IA32_VMX_ENTRY_CTLS: 1417 *pdata = vmx_control_msr( 1418 msrs->entry_ctls_low, 1419 msrs->entry_ctls_high); 1420 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1421 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1422 break; 1423 case MSR_IA32_VMX_MISC: 1424 *pdata = vmx_control_msr( 1425 msrs->misc_low, 1426 msrs->misc_high); 1427 break; 1428 case MSR_IA32_VMX_CR0_FIXED0: 1429 *pdata = msrs->cr0_fixed0; 1430 break; 1431 case MSR_IA32_VMX_CR0_FIXED1: 1432 *pdata = msrs->cr0_fixed1; 1433 break; 1434 case MSR_IA32_VMX_CR4_FIXED0: 1435 *pdata = msrs->cr4_fixed0; 1436 break; 1437 case MSR_IA32_VMX_CR4_FIXED1: 1438 *pdata = msrs->cr4_fixed1; 1439 break; 1440 case MSR_IA32_VMX_VMCS_ENUM: 1441 *pdata = msrs->vmcs_enum; 1442 break; 1443 case MSR_IA32_VMX_PROCBASED_CTLS2: 1444 *pdata = vmx_control_msr( 1445 msrs->secondary_ctls_low, 1446 msrs->secondary_ctls_high); 1447 break; 1448 case MSR_IA32_VMX_EPT_VPID_CAP: 1449 *pdata = msrs->ept_caps | 1450 ((u64)msrs->vpid_caps << 32); 1451 break; 1452 case MSR_IA32_VMX_VMFUNC: 1453 *pdata = msrs->vmfunc_controls; 1454 break; 1455 default: 1456 return 1; 1457 } 1458 1459 return 0; 1460 } 1461 1462 /* 1463 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have 1464 * been modified by the L1 guest. Note, "writable" in this context means 1465 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1466 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1467 * VM-exit information fields (which are actually writable if the vCPU is 1468 * configured to support "VMWRITE to any supported field in the VMCS"). 
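 * The copy below runs with preemption disabled since the shadow VMCS has to
 * be loaded as the current VMCS on this CPU while its fields are read.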
1469 */ 1470 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1471 { 1472 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1473 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1474 struct shadow_vmcs_field field; 1475 unsigned long val; 1476 int i; 1477 1478 if (WARN_ON(!shadow_vmcs)) 1479 return; 1480 1481 preempt_disable(); 1482 1483 vmcs_load(shadow_vmcs); 1484 1485 for (i = 0; i < max_shadow_read_write_fields; i++) { 1486 field = shadow_read_write_fields[i]; 1487 val = __vmcs_readl(field.encoding); 1488 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1489 } 1490 1491 vmcs_clear(shadow_vmcs); 1492 vmcs_load(vmx->loaded_vmcs->vmcs); 1493 1494 preempt_enable(); 1495 } 1496 1497 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1498 { 1499 const struct shadow_vmcs_field *fields[] = { 1500 shadow_read_write_fields, 1501 shadow_read_only_fields 1502 }; 1503 const int max_fields[] = { 1504 max_shadow_read_write_fields, 1505 max_shadow_read_only_fields 1506 }; 1507 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1508 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1509 struct shadow_vmcs_field field; 1510 unsigned long val; 1511 int i, q; 1512 1513 if (WARN_ON(!shadow_vmcs)) 1514 return; 1515 1516 vmcs_load(shadow_vmcs); 1517 1518 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1519 for (i = 0; i < max_fields[q]; i++) { 1520 field = fields[q][i]; 1521 val = vmcs12_read_any(vmcs12, field.encoding, 1522 field.offset); 1523 __vmcs_writel(field.encoding, val); 1524 } 1525 } 1526 1527 vmcs_clear(shadow_vmcs); 1528 vmcs_load(vmx->loaded_vmcs->vmcs); 1529 } 1530 1531 static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) 1532 { 1533 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1534 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1535 1536 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1537 vmcs12->tpr_threshold = evmcs->tpr_threshold; 1538 vmcs12->guest_rip = evmcs->guest_rip; 1539 1540 if (unlikely(!(evmcs->hv_clean_fields & 1541 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1542 vmcs12->guest_rsp = evmcs->guest_rsp; 1543 vmcs12->guest_rflags = evmcs->guest_rflags; 1544 vmcs12->guest_interruptibility_info = 1545 evmcs->guest_interruptibility_info; 1546 } 1547 1548 if (unlikely(!(evmcs->hv_clean_fields & 1549 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1550 vmcs12->cpu_based_vm_exec_control = 1551 evmcs->cpu_based_vm_exec_control; 1552 } 1553 1554 if (unlikely(!(evmcs->hv_clean_fields & 1555 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1556 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1557 } 1558 1559 if (unlikely(!(evmcs->hv_clean_fields & 1560 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1561 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1562 } 1563 1564 if (unlikely(!(evmcs->hv_clean_fields & 1565 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1566 vmcs12->vm_entry_intr_info_field = 1567 evmcs->vm_entry_intr_info_field; 1568 vmcs12->vm_entry_exception_error_code = 1569 evmcs->vm_entry_exception_error_code; 1570 vmcs12->vm_entry_instruction_len = 1571 evmcs->vm_entry_instruction_len; 1572 } 1573 1574 if (unlikely(!(evmcs->hv_clean_fields & 1575 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { 1576 vmcs12->host_ia32_pat = evmcs->host_ia32_pat; 1577 vmcs12->host_ia32_efer = evmcs->host_ia32_efer; 1578 vmcs12->host_cr0 = evmcs->host_cr0; 1579 vmcs12->host_cr3 = evmcs->host_cr3; 1580 vmcs12->host_cr4 = evmcs->host_cr4; 1581 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; 1582 vmcs12->host_ia32_sysenter_eip = 
evmcs->host_ia32_sysenter_eip; 1583 vmcs12->host_rip = evmcs->host_rip; 1584 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; 1585 vmcs12->host_es_selector = evmcs->host_es_selector; 1586 vmcs12->host_cs_selector = evmcs->host_cs_selector; 1587 vmcs12->host_ss_selector = evmcs->host_ss_selector; 1588 vmcs12->host_ds_selector = evmcs->host_ds_selector; 1589 vmcs12->host_fs_selector = evmcs->host_fs_selector; 1590 vmcs12->host_gs_selector = evmcs->host_gs_selector; 1591 vmcs12->host_tr_selector = evmcs->host_tr_selector; 1592 } 1593 1594 if (unlikely(!(evmcs->hv_clean_fields & 1595 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { 1596 vmcs12->pin_based_vm_exec_control = 1597 evmcs->pin_based_vm_exec_control; 1598 vmcs12->vm_exit_controls = evmcs->vm_exit_controls; 1599 vmcs12->secondary_vm_exec_control = 1600 evmcs->secondary_vm_exec_control; 1601 } 1602 1603 if (unlikely(!(evmcs->hv_clean_fields & 1604 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { 1605 vmcs12->io_bitmap_a = evmcs->io_bitmap_a; 1606 vmcs12->io_bitmap_b = evmcs->io_bitmap_b; 1607 } 1608 1609 if (unlikely(!(evmcs->hv_clean_fields & 1610 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { 1611 vmcs12->msr_bitmap = evmcs->msr_bitmap; 1612 } 1613 1614 if (unlikely(!(evmcs->hv_clean_fields & 1615 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { 1616 vmcs12->guest_es_base = evmcs->guest_es_base; 1617 vmcs12->guest_cs_base = evmcs->guest_cs_base; 1618 vmcs12->guest_ss_base = evmcs->guest_ss_base; 1619 vmcs12->guest_ds_base = evmcs->guest_ds_base; 1620 vmcs12->guest_fs_base = evmcs->guest_fs_base; 1621 vmcs12->guest_gs_base = evmcs->guest_gs_base; 1622 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; 1623 vmcs12->guest_tr_base = evmcs->guest_tr_base; 1624 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; 1625 vmcs12->guest_idtr_base = evmcs->guest_idtr_base; 1626 vmcs12->guest_es_limit = evmcs->guest_es_limit; 1627 vmcs12->guest_cs_limit = evmcs->guest_cs_limit; 1628 vmcs12->guest_ss_limit = evmcs->guest_ss_limit; 1629 vmcs12->guest_ds_limit = evmcs->guest_ds_limit; 1630 vmcs12->guest_fs_limit = evmcs->guest_fs_limit; 1631 vmcs12->guest_gs_limit = evmcs->guest_gs_limit; 1632 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; 1633 vmcs12->guest_tr_limit = evmcs->guest_tr_limit; 1634 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; 1635 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; 1636 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; 1637 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1638 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1639 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1640 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1641 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1642 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1643 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1644 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1645 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1646 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1647 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1648 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1649 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1650 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1651 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1652 } 1653 1654 if (unlikely(!(evmcs->hv_clean_fields & 1655 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1656 vmcs12->tsc_offset = evmcs->tsc_offset; 1657 vmcs12->virtual_apic_page_addr = 
evmcs->virtual_apic_page_addr; 1658 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1659 } 1660 1661 if (unlikely(!(evmcs->hv_clean_fields & 1662 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1663 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1664 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1665 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1666 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1667 vmcs12->guest_cr0 = evmcs->guest_cr0; 1668 vmcs12->guest_cr3 = evmcs->guest_cr3; 1669 vmcs12->guest_cr4 = evmcs->guest_cr4; 1670 vmcs12->guest_dr7 = evmcs->guest_dr7; 1671 } 1672 1673 if (unlikely(!(evmcs->hv_clean_fields & 1674 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1675 vmcs12->host_fs_base = evmcs->host_fs_base; 1676 vmcs12->host_gs_base = evmcs->host_gs_base; 1677 vmcs12->host_tr_base = evmcs->host_tr_base; 1678 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1679 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1680 vmcs12->host_rsp = evmcs->host_rsp; 1681 } 1682 1683 if (unlikely(!(evmcs->hv_clean_fields & 1684 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1685 vmcs12->ept_pointer = evmcs->ept_pointer; 1686 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1687 } 1688 1689 if (unlikely(!(evmcs->hv_clean_fields & 1690 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1691 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1692 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1693 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1694 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1695 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1696 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1697 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1698 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1699 vmcs12->guest_pending_dbg_exceptions = 1700 evmcs->guest_pending_dbg_exceptions; 1701 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1702 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1703 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1704 vmcs12->guest_activity_state = evmcs->guest_activity_state; 1705 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1706 } 1707 1708 /* 1709 * Not used? 
1710 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1711 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1712 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1713 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0; 1714 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1; 1715 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2; 1716 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3; 1717 * vmcs12->page_fault_error_code_mask = 1718 * evmcs->page_fault_error_code_mask; 1719 * vmcs12->page_fault_error_code_match = 1720 * evmcs->page_fault_error_code_match; 1721 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1722 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1723 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1724 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1725 */ 1726 1727 /* 1728 * Read only fields: 1729 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1730 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1731 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1732 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1733 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1734 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1735 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1736 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1737 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1738 * vmcs12->exit_qualification = evmcs->exit_qualification; 1739 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1740 * 1741 * Not present in struct vmcs12: 1742 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1743 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1744 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1745 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1746 */ 1747 1748 return 0; 1749 } 1750 1751 static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1752 { 1753 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1754 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1755 1756 /* 1757 * Should not be changed by KVM: 1758 * 1759 * evmcs->host_es_selector = vmcs12->host_es_selector; 1760 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1761 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1762 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1763 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1764 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1765 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1766 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1767 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1768 * evmcs->host_cr0 = vmcs12->host_cr0; 1769 * evmcs->host_cr3 = vmcs12->host_cr3; 1770 * evmcs->host_cr4 = vmcs12->host_cr4; 1771 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1772 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1773 * evmcs->host_rip = vmcs12->host_rip; 1774 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1775 * evmcs->host_fs_base = vmcs12->host_fs_base; 1776 * evmcs->host_gs_base = vmcs12->host_gs_base; 1777 * evmcs->host_tr_base = vmcs12->host_tr_base; 1778 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1779 * evmcs->host_idtr_base = vmcs12->host_idtr_base; 1780 * evmcs->host_rsp = vmcs12->host_rsp; 1781 * 
sync_vmcs02_to_vmcs12() doesn't read these: 1782 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1783 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1784 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1785 * evmcs->ept_pointer = vmcs12->ept_pointer; 1786 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1787 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1788 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1789 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1790 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0; 1791 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1; 1792 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2; 1793 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3; 1794 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1795 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1796 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1797 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1798 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1799 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1800 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1801 * evmcs->page_fault_error_code_mask = 1802 * vmcs12->page_fault_error_code_mask; 1803 * evmcs->page_fault_error_code_match = 1804 * vmcs12->page_fault_error_code_match; 1805 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1806 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1807 * evmcs->tsc_offset = vmcs12->tsc_offset; 1808 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1809 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1810 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1811 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1812 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1813 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1814 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1815 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1816 * 1817 * Not present in struct vmcs12: 1818 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1819 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1820 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1821 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1822 */ 1823 1824 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1825 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1826 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1827 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1828 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1829 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1830 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1831 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1832 1833 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1834 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1835 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1836 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1837 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1838 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1839 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1840 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1841 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1842 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1843 1844 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1845 evmcs->guest_cs_ar_bytes = 
vmcs12->guest_cs_ar_bytes; 1846 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1847 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1848 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1849 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1850 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1851 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1852 1853 evmcs->guest_es_base = vmcs12->guest_es_base; 1854 evmcs->guest_cs_base = vmcs12->guest_cs_base; 1855 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1856 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1857 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1858 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1859 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1860 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1861 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1862 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1863 1864 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1865 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1866 1867 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1868 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1869 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1870 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1871 1872 evmcs->guest_pending_dbg_exceptions = 1873 vmcs12->guest_pending_dbg_exceptions; 1874 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1875 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1876 1877 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1878 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1879 1880 evmcs->guest_cr0 = vmcs12->guest_cr0; 1881 evmcs->guest_cr3 = vmcs12->guest_cr3; 1882 evmcs->guest_cr4 = vmcs12->guest_cr4; 1883 evmcs->guest_dr7 = vmcs12->guest_dr7; 1884 1885 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1886 1887 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1888 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1889 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1890 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1891 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1892 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1893 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1894 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1895 1896 evmcs->exit_qualification = vmcs12->exit_qualification; 1897 1898 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1899 evmcs->guest_rsp = vmcs12->guest_rsp; 1900 evmcs->guest_rflags = vmcs12->guest_rflags; 1901 1902 evmcs->guest_interruptibility_info = 1903 vmcs12->guest_interruptibility_info; 1904 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1905 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1906 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1907 evmcs->vm_entry_exception_error_code = 1908 vmcs12->vm_entry_exception_error_code; 1909 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 1910 1911 evmcs->guest_rip = vmcs12->guest_rip; 1912 1913 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 1914 1915 return 0; 1916 } 1917 1918 /* 1919 * This is an equivalent of the nested hypervisor executing the vmptrld 1920 * instruction. 
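 *
 * Unlike a real VMPTRLD there is no instruction operand here: the eVMCS
 * GPA is taken from the guest's Hyper-V VP assist page via
 * nested_enlightened_vmentry() in the function below.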
1921 */ 1922 static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, 1923 bool from_launch) 1924 { 1925 struct vcpu_vmx *vmx = to_vmx(vcpu); 1926 bool evmcs_gpa_changed = false; 1927 u64 evmcs_gpa; 1928 1929 if (likely(!vmx->nested.enlightened_vmcs_enabled)) 1930 return 1; 1931 1932 if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) 1933 return 1; 1934 1935 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { 1936 if (!vmx->nested.hv_evmcs) 1937 vmx->nested.current_vmptr = -1ull; 1938 1939 nested_release_evmcs(vcpu); 1940 1941 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), 1942 &vmx->nested.hv_evmcs_map)) 1943 return 0; 1944 1945 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; 1946 1947 /* 1948 * Currently, KVM only supports eVMCS version 1 1949 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set this 1950 * value in the first u32 field of the eVMCS, which should specify 1951 * the eVMCS VersionNumber. 1952 * 1953 * The guest should learn the eVMCS versions supported by the host 1954 * by examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM is 1955 * expected to set this CPUID leaf according to the value 1956 * returned in vmcs_version from nested_enable_evmcs(). 1957 * 1958 * However, it turns out that Microsoft Hyper-V fails to comply 1959 * with its own invented interface: when Hyper-V uses eVMCS, it 1960 * simply sets the first u32 field of the eVMCS to the revision_id 1961 * specified in MSR_IA32_VMX_BASIC, instead of an eVMCS version 1962 * number from the supported set advertised in 1963 * CPUID.0x4000000A.EAX[0:15]. 1964 * 1965 * To overcome this Hyper-V bug, we accept here either a supported 1966 * eVMCS version or the VMCS12 revision_id as valid values for the 1967 * first u32 field of the eVMCS. 1968 */ 1969 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && 1970 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { 1971 nested_release_evmcs(vcpu); 1972 return 0; 1973 } 1974 1975 vmx->nested.dirty_vmcs12 = true; 1976 vmx->nested.hv_evmcs_vmptr = evmcs_gpa; 1977 1978 evmcs_gpa_changed = true; 1979 /* 1980 * Unlike normal vmcs12, enlightened vmcs12 is not fully 1981 * reloaded from guest's memory (read only fields, fields not 1982 * present in struct hv_enlightened_vmcs, ...). Make sure there 1983 * are no leftovers. 1984 */ 1985 if (from_launch) { 1986 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1987 memset(vmcs12, 0, sizeof(*vmcs12)); 1988 vmcs12->hdr.revision_id = VMCS12_REVISION; 1989 } 1990 1991 } 1992 1993 /* 1994 * Clean fields data can't be used on VMLAUNCH and when we switch 1995 * between different L2 guests as KVM keeps a single VMCS12 per L1. 1996 */ 1997 if (from_launch || evmcs_gpa_changed) 1998 vmx->nested.hv_evmcs->hv_clean_fields &= 1999 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2000 2001 return 1; 2002 } 2003 2004 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2005 { 2006 struct vcpu_vmx *vmx = to_vmx(vcpu); 2007 2008 /* 2009 * hv_evmcs may end up not being mapped after migration (when 2010 * L2 was running); map it here to make sure vmcs12 changes are 2011 * properly reflected.
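 *
 * Once the eVMCS is mapped (or when a regular shadow VMCS is in use
 * instead), the cached vmcs12 is flushed out below: either into the
 * enlightened VMCS, with all clean fields re-marked, or into the shadow
 * VMCS via copy_vmcs12_to_shadow().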
2012 */ 2013 if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) 2014 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 2015 2016 if (vmx->nested.hv_evmcs) { 2017 copy_vmcs12_to_enlightened(vmx); 2018 /* All fields are clean */ 2019 vmx->nested.hv_evmcs->hv_clean_fields |= 2020 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2021 } else { 2022 copy_vmcs12_to_shadow(vmx); 2023 } 2024 2025 vmx->nested.need_vmcs12_to_shadow_sync = false; 2026 } 2027 2028 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2029 { 2030 struct vcpu_vmx *vmx = 2031 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2032 2033 vmx->nested.preemption_timer_expired = true; 2034 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2035 kvm_vcpu_kick(&vmx->vcpu); 2036 2037 return HRTIMER_NORESTART; 2038 } 2039 2040 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) 2041 { 2042 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; 2043 struct vcpu_vmx *vmx = to_vmx(vcpu); 2044 2045 /* 2046 * A timer value of zero is architecturally guaranteed to cause 2047 * a VMExit prior to executing any instructions in the guest. 2048 */ 2049 if (preemption_timeout == 0) { 2050 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2051 return; 2052 } 2053 2054 if (vcpu->arch.virtual_tsc_khz == 0) 2055 return; 2056 2057 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2058 preemption_timeout *= 1000000; 2059 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2060 hrtimer_start(&vmx->nested.preemption_timer, 2061 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); 2062 } 2063 2064 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2065 { 2066 if (vmx->nested.nested_run_pending && 2067 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2068 return vmcs12->guest_ia32_efer; 2069 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2070 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2071 else 2072 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2073 } 2074 2075 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2076 { 2077 /* 2078 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2079 * according to L0's settings (vmcs12 is irrelevant here). Host 2080 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2081 * will be set as needed prior to VMLAUNCH/VMRESUME. 2082 */ 2083 if (vmx->nested.vmcs02_initialized) 2084 return; 2085 vmx->nested.vmcs02_initialized = true; 2086 2087 /* 2088 * We don't care what the EPTP value is we just need to guarantee 2089 * it's valid so we don't get a false positive when doing early 2090 * consistency checks. 2091 */ 2092 if (enable_ept && nested_early_check) 2093 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); 2094 2095 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2096 if (cpu_has_vmx_vmfunc()) 2097 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2098 2099 if (cpu_has_vmx_posted_intr()) 2100 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2101 2102 if (cpu_has_vmx_msr_bitmap()) 2103 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2104 2105 /* 2106 * The PML address never changes, so it is constant in vmcs02. 2107 * Conceptually we want to copy the PML index from vmcs01 here, 2108 * and then back to vmcs01 on nested vmexit. But since we flush 2109 * the log and reset GUEST_PML_INDEX on each vmexit, the PML 2110 * index is also effectively constant in vmcs02. 
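 *
 * (GUEST_PML_INDEX is set to PML_ENTITY_NUM - 1 because hardware writes
 * log entries from the highest index downwards; a reset index therefore
 * denotes an empty log.)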
2111 */ 2112 if (enable_pml) { 2113 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 2114 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 2115 } 2116 2117 if (cpu_has_vmx_encls_vmexit()) 2118 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); 2119 2120 /* 2121 * Set the MSR load/store lists to match L0's settings. Only the 2122 * addresses are constant (for vmcs02), the counts can change based 2123 * on L2's behavior, e.g. switching to/from long mode. 2124 */ 2125 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2126 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2127 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2128 2129 vmx_set_constant_host_state(vmx); 2130 } 2131 2132 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2133 struct vmcs12 *vmcs12) 2134 { 2135 prepare_vmcs02_constant_state(vmx); 2136 2137 vmcs_write64(VMCS_LINK_POINTER, -1ull); 2138 2139 if (enable_vpid) { 2140 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2141 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2142 else 2143 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2144 } 2145 } 2146 2147 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2148 { 2149 u32 exec_control, vmcs12_exec_ctrl; 2150 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2151 2152 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) 2153 prepare_vmcs02_early_rare(vmx, vmcs12); 2154 2155 /* 2156 * PIN CONTROLS 2157 */ 2158 exec_control = vmx_pin_based_exec_ctrl(vmx); 2159 exec_control |= (vmcs12->pin_based_vm_exec_control & 2160 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2161 2162 /* Posted interrupts setting is only taken from vmcs12. */ 2163 if (nested_cpu_has_posted_intr(vmcs12)) { 2164 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2165 vmx->nested.pi_pending = false; 2166 } else { 2167 exec_control &= ~PIN_BASED_POSTED_INTR; 2168 } 2169 pin_controls_set(vmx, exec_control); 2170 2171 /* 2172 * EXEC CONTROLS 2173 */ 2174 exec_control = vmx_exec_control(vmx); /* L0's desires */ 2175 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 2176 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 2177 exec_control &= ~CPU_BASED_TPR_SHADOW; 2178 exec_control |= vmcs12->cpu_based_vm_exec_control; 2179 2180 vmx->nested.l1_tpr_threshold = -1; 2181 if (exec_control & CPU_BASED_TPR_SHADOW) 2182 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2183 #ifdef CONFIG_X86_64 2184 else 2185 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2186 CPU_BASED_CR8_STORE_EXITING; 2187 #endif 2188 2189 /* 2190 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2191 * for I/O port accesses. 2192 */ 2193 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2194 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2195 2196 /* 2197 * This bit will be computed in nested_get_vmcs12_pages, because 2198 * we do not have access to L1's MSR bitmap yet. For now, keep 2199 * the same bit as before, hoping to avoid multiple VMWRITEs that 2200 * only set/clear this bit. 
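 *
 * Concretely, nested_get_vmcs12_pages() calls
 * nested_vmx_prepare_msr_bitmap() and then sets or clears
 * CPU_BASED_USE_MSR_BITMAPS in the exec controls based on the result.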
2201 */ 2202 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2203 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2204 2205 exec_controls_set(vmx, exec_control); 2206 2207 /* 2208 * SECONDARY EXEC CONTROLS 2209 */ 2210 if (cpu_has_secondary_exec_ctrls()) { 2211 exec_control = vmx->secondary_exec_control; 2212 2213 /* Take the following fields only from vmcs12 */ 2214 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2215 SECONDARY_EXEC_ENABLE_INVPCID | 2216 SECONDARY_EXEC_RDTSCP | 2217 SECONDARY_EXEC_XSAVES | 2218 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2219 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2220 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2221 SECONDARY_EXEC_ENABLE_VMFUNC); 2222 if (nested_cpu_has(vmcs12, 2223 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { 2224 vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & 2225 ~SECONDARY_EXEC_ENABLE_PML; 2226 exec_control |= vmcs12_exec_ctrl; 2227 } 2228 2229 /* VMCS shadowing for L2 is emulated for now */ 2230 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2231 2232 /* 2233 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2234 * will not have to rewrite the controls just for this bit. 2235 */ 2236 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2237 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2238 exec_control |= SECONDARY_EXEC_DESC; 2239 2240 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2241 vmcs_write16(GUEST_INTR_STATUS, 2242 vmcs12->guest_intr_status); 2243 2244 secondary_exec_controls_set(vmx, exec_control); 2245 } 2246 2247 /* 2248 * ENTRY CONTROLS 2249 * 2250 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2251 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2252 * on the related bits (if supported by the CPU) in the hope that 2253 * we can avoid VMWrites during vmx_set_efer(). 2254 */ 2255 exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & 2256 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; 2257 if (cpu_has_load_ia32_efer()) { 2258 if (guest_efer & EFER_LMA) 2259 exec_control |= VM_ENTRY_IA32E_MODE; 2260 if (guest_efer != host_efer) 2261 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2262 } 2263 vm_entry_controls_set(vmx, exec_control); 2264 2265 /* 2266 * EXIT CONTROLS 2267 * 2268 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2269 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2270 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
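 *
 * (The nested VM-Exit to L1 itself is synthesized in software by
 * nested_vmx_vmexit(), which consults vmcs12's exit controls at that
 * point; they are never programmed into vmcs02.)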
2271 */ 2272 exec_control = vmx_vmexit_ctrl(); 2273 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2274 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2275 vm_exit_controls_set(vmx, exec_control); 2276 2277 /* 2278 * Interrupt/Exception Fields 2279 */ 2280 if (vmx->nested.nested_run_pending) { 2281 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2282 vmcs12->vm_entry_intr_info_field); 2283 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2284 vmcs12->vm_entry_exception_error_code); 2285 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2286 vmcs12->vm_entry_instruction_len); 2287 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2288 vmcs12->guest_interruptibility_info); 2289 vmx->loaded_vmcs->nmi_known_unmasked = 2290 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2291 } else { 2292 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2293 } 2294 } 2295 2296 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2297 { 2298 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2299 2300 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2301 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2302 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2303 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2304 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2305 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2306 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2307 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2308 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2309 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2310 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2311 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2312 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2313 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2314 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2315 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2316 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2317 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2318 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2319 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2320 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2321 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2322 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2323 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2324 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2325 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2326 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2327 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2328 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2329 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2330 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2331 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2332 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2333 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2334 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2335 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2336 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2337 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2338 } 2339 2340 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2341 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2342 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2343 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 
2344 vmcs12->guest_pending_dbg_exceptions); 2345 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2346 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2347 2348 /* 2349 * L1 may access the L2's PDPTR, so save them to construct 2350 * vmcs12 2351 */ 2352 if (enable_ept) { 2353 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2354 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2355 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2356 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2357 } 2358 2359 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2360 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2361 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2362 } 2363 2364 if (nested_cpu_has_xsaves(vmcs12)) 2365 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2366 2367 /* 2368 * Whether page-faults are trapped is determined by a combination of 2369 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. 2370 * If enable_ept, L0 doesn't care about page faults and we should 2371 * set all of these to L1's desires. However, if !enable_ept, L0 does 2372 * care about (at least some) page faults, and because it is not easy 2373 * (if at all possible?) to merge L0 and L1's desires, we simply ask 2374 * to exit on each and every L2 page fault. This is done by setting 2375 * MASK=MATCH=0 and (see below) EB.PF=1. 2376 * Note that below we don't need special code to set EB.PF beyond the 2377 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2378 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2379 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2380 */ 2381 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 2382 enable_ept ? vmcs12->page_fault_error_code_mask : 0); 2383 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 2384 enable_ept ? vmcs12->page_fault_error_code_match : 0); 2385 2386 if (cpu_has_vmx_apicv()) { 2387 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2388 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2389 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2390 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2391 } 2392 2393 /* 2394 * Make sure the msr_autostore list is up to date before we set the 2395 * count in the vmcs02. 2396 */ 2397 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2398 2399 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2400 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2401 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2402 2403 set_cr4_guest_host_mask(vmx); 2404 } 2405 2406 /* 2407 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2408 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2409 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2410 * guest in a way that will both be appropriate to L1's requests, and our 2411 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2412 * function also has additional necessary side-effects, like setting various 2413 * vcpu->arch fields. 2414 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2415 * is assigned to entry_failure_code on failure. 
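 *
 * (prepare_vmcs02() is called from nested_vmx_enter_non_root_mode(),
 * after prepare_vmcs02_early() and, on a real VMLAUNCH/VMRESUME, after
 * the optional early hardware checks.)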
2416 */ 2417 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2418 u32 *entry_failure_code) 2419 { 2420 struct vcpu_vmx *vmx = to_vmx(vcpu); 2421 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2422 bool load_guest_pdptrs_vmcs12 = false; 2423 2424 if (vmx->nested.dirty_vmcs12 || hv_evmcs) { 2425 prepare_vmcs02_rare(vmx, vmcs12); 2426 vmx->nested.dirty_vmcs12 = false; 2427 2428 load_guest_pdptrs_vmcs12 = !hv_evmcs || 2429 !(hv_evmcs->hv_clean_fields & 2430 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2431 } 2432 2433 if (vmx->nested.nested_run_pending && 2434 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2435 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2436 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2437 } else { 2438 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2439 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 2440 } 2441 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2442 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2443 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); 2444 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2445 2446 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2447 * bitwise-or of what L1 wants to trap for L2, and what we want to 2448 * trap. Note that CR0.TS also needs updating - we do this later. 2449 */ 2450 update_exception_bitmap(vcpu); 2451 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2452 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2453 2454 if (vmx->nested.nested_run_pending && 2455 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2456 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2457 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2458 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2459 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2460 } 2461 2462 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2463 2464 if (kvm_has_tsc_control) 2465 decache_tsc_multiplier(vmx); 2466 2467 if (enable_vpid) { 2468 /* 2469 * There is no direct mapping between vpid02 and vpid12, the 2470 * vpid02 is per-vCPU for L0 and reused while the value of 2471 * vpid12 is changed w/ one invvpid during nested vmentry. 2472 * The vpid12 is allocated by L1 for L2, so it will not 2473 * influence global bitmap(for vpid01 and vpid02 allocation) 2474 * even if spawn a lot of nested vCPUs. 2475 */ 2476 if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { 2477 if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 2478 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 2479 __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); 2480 } 2481 } else { 2482 /* 2483 * If L1 use EPT, then L0 needs to execute INVEPT on 2484 * EPTP02 instead of EPTP01. Therefore, delay TLB 2485 * flush until vmcs02->eptp is fully updated by 2486 * KVM_REQ_LOAD_CR3. Note that this assumes 2487 * KVM_REQ_TLB_FLUSH is evaluated after 2488 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). 2489 */ 2490 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2491 } 2492 } 2493 2494 if (nested_cpu_has_ept(vmcs12)) 2495 nested_ept_init_mmu_context(vcpu); 2496 2497 /* 2498 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those 2499 * bits which we consider mandatory enabled. 
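 * (CR0.NE is one such mandatory bit; without unrestricted guest, CR0.PE
 * and CR0.PG are forced on as well.)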
2500 * The CR0_READ_SHADOW is what L2 should have expected to read given 2501 * the specifications by L1; it's not enough to take 2502 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may have 2503 * more bits set than L1 expected. 2504 */ 2505 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2506 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2507 2508 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2509 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2510 2511 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2512 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2513 vmx_set_efer(vcpu, vcpu->arch.efer); 2514 2515 /* 2516 * Guest state is invalid and unrestricted guest is disabled, 2517 * which means L1 attempted VMEntry to L2 with invalid state. 2518 * Fail the VMEntry. 2519 */ 2520 if (vmx->emulation_required) { 2521 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2522 return -EINVAL; 2523 } 2524 2525 /* Load guest_cr3; the guest's page tables are backed by either EPT or KVM's shadow page tables. */ 2526 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2527 entry_failure_code)) 2528 return -EINVAL; 2529 2530 /* 2531 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 2532 * on nested VM-Exit, which can occur without actually running L2 and 2533 * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with 2534 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the 2535 * transition to HLT instead of running L2. 2536 */ 2537 if (enable_ept) 2538 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2539 2540 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2541 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2542 is_pae_paging(vcpu)) { 2543 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2544 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2545 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2546 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2547 } 2548 2549 if (!enable_ept) 2550 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2551 2552 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2553 SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2554 vmcs12->guest_ia32_perf_global_ctrl)) 2555 return -EINVAL; 2556 2557 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2558 kvm_rip_write(vcpu, vmcs12->guest_rip); 2559 return 0; 2560 } 2561 2562 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2563 { 2564 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2565 nested_cpu_has_virtual_nmis(vmcs12))) 2566 return -EINVAL; 2567 2568 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2569 nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))) 2570 return -EINVAL; 2571 2572 return 0; 2573 } 2574 2575 static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) 2576 { 2577 struct vcpu_vmx *vmx = to_vmx(vcpu); 2578 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2579 2580 /* Check for memory type validity */ 2581 switch (address & VMX_EPTP_MT_MASK) { 2582 case VMX_EPTP_MT_UC: 2583 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2584 return false; 2585 break; 2586 case VMX_EPTP_MT_WB: 2587 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2588 return false; 2589 break; 2590 default: 2591 return false; 2592 } 2593 2594 /* only a page-walk length of 4 is valid */ 2595 if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)) 2596 return false; 2597 2598 /* Reserved bits should not be set */ 2599 if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f))) 2600 return
false; 2601 2602 /* AD, if set, should be supported */ 2603 if (address & VMX_EPTP_AD_ENABLE_BIT) { 2604 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2605 return false; 2606 } 2607 2608 return true; 2609 } 2610 2611 /* 2612 * Checks related to VM-Execution Control Fields 2613 */ 2614 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2615 struct vmcs12 *vmcs12) 2616 { 2617 struct vcpu_vmx *vmx = to_vmx(vcpu); 2618 2619 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2620 vmx->nested.msrs.pinbased_ctls_low, 2621 vmx->nested.msrs.pinbased_ctls_high)) || 2622 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2623 vmx->nested.msrs.procbased_ctls_low, 2624 vmx->nested.msrs.procbased_ctls_high))) 2625 return -EINVAL; 2626 2627 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2628 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2629 vmx->nested.msrs.secondary_ctls_low, 2630 vmx->nested.msrs.secondary_ctls_high))) 2631 return -EINVAL; 2632 2633 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2634 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2635 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2636 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2637 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2638 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2639 nested_vmx_check_nmi_controls(vmcs12) || 2640 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2641 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2642 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2643 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2644 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2645 return -EINVAL; 2646 2647 if (!nested_cpu_has_preemption_timer(vmcs12) && 2648 nested_cpu_has_save_preemption_timer(vmcs12)) 2649 return -EINVAL; 2650 2651 if (nested_cpu_has_ept(vmcs12) && 2652 CC(!valid_ept_address(vcpu, vmcs12->ept_pointer))) 2653 return -EINVAL; 2654 2655 if (nested_cpu_has_vmfunc(vmcs12)) { 2656 if (CC(vmcs12->vm_function_control & 2657 ~vmx->nested.msrs.vmfunc_controls)) 2658 return -EINVAL; 2659 2660 if (nested_cpu_has_eptp_switching(vmcs12)) { 2661 if (CC(!nested_cpu_has_ept(vmcs12)) || 2662 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2663 return -EINVAL; 2664 } 2665 } 2666 2667 return 0; 2668 } 2669 2670 /* 2671 * Checks related to VM-Exit Control Fields 2672 */ 2673 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2674 struct vmcs12 *vmcs12) 2675 { 2676 struct vcpu_vmx *vmx = to_vmx(vcpu); 2677 2678 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2679 vmx->nested.msrs.exit_ctls_low, 2680 vmx->nested.msrs.exit_ctls_high)) || 2681 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2682 return -EINVAL; 2683 2684 return 0; 2685 } 2686 2687 /* 2688 * Checks related to VM-Entry Control Fields 2689 */ 2690 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2691 struct vmcs12 *vmcs12) 2692 { 2693 struct vcpu_vmx *vmx = to_vmx(vcpu); 2694 2695 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2696 vmx->nested.msrs.entry_ctls_low, 2697 vmx->nested.msrs.entry_ctls_high))) 2698 return -EINVAL; 2699 2700 /* 2701 * From the Intel SDM, volume 3: 2702 * Fields relevant to VM-entry event injection must be set properly. 2703 * These fields are the VM-entry interruption-information field, the 2704 * VM-entry exception error code, and the VM-entry instruction length. 
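 *
 * For example, injecting #PF (vector 14, hardware exception type) into a
 * protected-mode guest requires the deliver-error-code bit and an error
 * code with bits 31:16 clear, whereas injecting an NMI requires vector 2
 * and no error code.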
2705 */ 2706 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2707 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2708 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2709 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2710 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2711 bool should_have_error_code; 2712 bool urg = nested_cpu_has2(vmcs12, 2713 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2714 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2715 2716 /* VM-entry interruption-info field: interruption type */ 2717 if (CC(intr_type == INTR_TYPE_RESERVED) || 2718 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2719 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2720 return -EINVAL; 2721 2722 /* VM-entry interruption-info field: vector */ 2723 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2724 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2725 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2726 return -EINVAL; 2727 2728 /* VM-entry interruption-info field: deliver error code */ 2729 should_have_error_code = 2730 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2731 x86_exception_has_error_code(vector); 2732 if (CC(has_error_code != should_have_error_code)) 2733 return -EINVAL; 2734 2735 /* VM-entry exception error code */ 2736 if (CC(has_error_code && 2737 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2738 return -EINVAL; 2739 2740 /* VM-entry interruption-info field: reserved bits */ 2741 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2742 return -EINVAL; 2743 2744 /* VM-entry instruction length */ 2745 switch (intr_type) { 2746 case INTR_TYPE_SOFT_EXCEPTION: 2747 case INTR_TYPE_SOFT_INTR: 2748 case INTR_TYPE_PRIV_SW_EXCEPTION: 2749 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2750 CC(vmcs12->vm_entry_instruction_len == 0 && 2751 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2752 return -EINVAL; 2753 } 2754 } 2755 2756 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2757 return -EINVAL; 2758 2759 return 0; 2760 } 2761 2762 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2763 struct vmcs12 *vmcs12) 2764 { 2765 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2766 nested_check_vm_exit_controls(vcpu, vmcs12) || 2767 nested_check_vm_entry_controls(vcpu, vmcs12)) 2768 return -EINVAL; 2769 2770 return 0; 2771 } 2772 2773 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2774 struct vmcs12 *vmcs12) 2775 { 2776 bool ia32e; 2777 2778 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2779 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2780 CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3))) 2781 return -EINVAL; 2782 2783 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2784 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2785 return -EINVAL; 2786 2787 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2788 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2789 return -EINVAL; 2790 2791 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2792 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2793 vmcs12->host_ia32_perf_global_ctrl))) 2794 return -EINVAL; 2795 2796 #ifdef CONFIG_X86_64 2797 ia32e = !!(vcpu->arch.efer & EFER_LMA); 2798 #else 2799 ia32e = false; 2800 #endif 2801 2802 if (ia32e) { 2803 if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || 2804 CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2805 return -EINVAL; 2806 } else { 2807 if (CC(vmcs12->vm_exit_controls & 
VM_EXIT_HOST_ADDR_SPACE_SIZE) || 2808 CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2809 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2810 CC((vmcs12->host_rip) >> 32)) 2811 return -EINVAL; 2812 } 2813 2814 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2815 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2816 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2817 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2818 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2819 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2820 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2821 CC(vmcs12->host_cs_selector == 0) || 2822 CC(vmcs12->host_tr_selector == 0) || 2823 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2824 return -EINVAL; 2825 2826 #ifdef CONFIG_X86_64 2827 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2828 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2829 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2830 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2831 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2832 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2833 return -EINVAL; 2834 #endif 2835 2836 /* 2837 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2838 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2839 * the values of the LMA and LME bits in the field must each be that of 2840 * the host address-space size VM-exit control. 2841 */ 2842 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2843 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2844 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2845 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2846 return -EINVAL; 2847 } 2848 2849 return 0; 2850 } 2851 2852 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2853 struct vmcs12 *vmcs12) 2854 { 2855 int r = 0; 2856 struct vmcs12 *shadow; 2857 struct kvm_host_map map; 2858 2859 if (vmcs12->vmcs_link_pointer == -1ull) 2860 return 0; 2861 2862 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2863 return -EINVAL; 2864 2865 if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) 2866 return -EINVAL; 2867 2868 shadow = map.hva; 2869 2870 if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || 2871 CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2872 r = -EINVAL; 2873 2874 kvm_vcpu_unmap(vcpu, &map, false); 2875 return r; 2876 } 2877 2878 /* 2879 * Checks related to Guest Non-register State 2880 */ 2881 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2882 { 2883 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2884 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)) 2885 return -EINVAL; 2886 2887 return 0; 2888 } 2889 2890 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 2891 struct vmcs12 *vmcs12, 2892 u32 *exit_qual) 2893 { 2894 bool ia32e; 2895 2896 *exit_qual = ENTRY_FAIL_DEFAULT; 2897 2898 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 2899 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 2900 return -EINVAL; 2901 2902 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 2903 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 2904 return -EINVAL; 2905 2906 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 2907 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; 2908 
return -EINVAL; 2909 } 2910 2911 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2912 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2913 vmcs12->guest_ia32_perf_global_ctrl))) 2914 return -EINVAL; 2915 2916 /* 2917 * If the load IA32_EFER VM-entry control is 1, the following checks 2918 * are performed on the field for the IA32_EFER MSR: 2919 * - Bits reserved in the IA32_EFER MSR must be 0. 2920 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 2921 * the IA-32e mode guest VM-exit control. It must also be identical 2922 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 2923 * CR0.PG) is 1. 2924 */ 2925 if (to_vmx(vcpu)->nested.nested_run_pending && 2926 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 2927 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 2928 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 2929 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 2930 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 2931 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 2932 return -EINVAL; 2933 } 2934 2935 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 2936 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 2937 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 2938 return -EINVAL; 2939 2940 if (nested_check_guest_non_reg_state(vmcs12)) 2941 return -EINVAL; 2942 2943 return 0; 2944 } 2945 2946 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 2947 { 2948 struct vcpu_vmx *vmx = to_vmx(vcpu); 2949 unsigned long cr3, cr4; 2950 bool vm_fail; 2951 2952 if (!nested_early_check) 2953 return 0; 2954 2955 if (vmx->msr_autoload.host.nr) 2956 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 2957 if (vmx->msr_autoload.guest.nr) 2958 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 2959 2960 preempt_disable(); 2961 2962 vmx_prepare_switch_to_guest(vcpu); 2963 2964 /* 2965 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 2966 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 2967 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 2968 * there is no need to preserve other bits or save/restore the field. 2969 */ 2970 vmcs_writel(GUEST_RFLAGS, 0); 2971 2972 cr3 = __get_current_cr3_fast(); 2973 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 2974 vmcs_writel(HOST_CR3, cr3); 2975 vmx->loaded_vmcs->host_state.cr3 = cr3; 2976 } 2977 2978 cr4 = cr4_read_shadow(); 2979 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 2980 vmcs_writel(HOST_CR4, cr4); 2981 vmx->loaded_vmcs->host_state.cr4 = cr4; 2982 } 2983 2984 asm( 2985 "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ 2986 "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" 2987 "je 1f \n\t" 2988 __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" 2989 "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" 2990 "1: \n\t" 2991 "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ 2992 2993 /* Check if vmlaunch or vmresume is needed */ 2994 "cmpb $0, %c[launched](%[loaded_vmcs])\n\t" 2995 2996 /* 2997 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set 2998 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail 2999 * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the 3000 * result of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
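 *
 * (CC_SET(be)/CC_OUT(be) use the "below or equal" condition, i.e.
 * CF=1 or ZF=1, so vm_fail ends up set for either flavor of VM-Fail.)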
3001 */ 3002 "call vmx_vmenter\n\t" 3003 3004 CC_SET(be) 3005 : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) 3006 : [HOST_RSP]"r"((unsigned long)HOST_RSP), 3007 [loaded_vmcs]"r"(vmx->loaded_vmcs), 3008 [launched]"i"(offsetof(struct loaded_vmcs, launched)), 3009 [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), 3010 [wordsize]"i"(sizeof(ulong)) 3011 : "memory" 3012 ); 3013 3014 if (vmx->msr_autoload.host.nr) 3015 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3016 if (vmx->msr_autoload.guest.nr) 3017 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3018 3019 if (vm_fail) { 3020 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3021 3022 preempt_enable(); 3023 3024 trace_kvm_nested_vmenter_failed( 3025 "early hardware check VM-instruction error: ", error); 3026 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3027 return 1; 3028 } 3029 3030 /* 3031 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3032 */ 3033 local_irq_enable(); 3034 if (hw_breakpoint_active()) 3035 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3036 preempt_enable(); 3037 3038 /* 3039 * A non-failing VMEntry means we somehow entered guest mode with 3040 * an illegal RIP, and that's just the tip of the iceberg. There 3041 * is no telling what memory has been modified or what state has 3042 * been exposed to unknown code. Hitting this all but guarantees 3043 * a (very critical) hardware issue. 3044 */ 3045 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3046 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3047 3048 return 0; 3049 } 3050 3051 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 3052 struct vmcs12 *vmcs12); 3053 3054 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3055 { 3056 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3057 struct vcpu_vmx *vmx = to_vmx(vcpu); 3058 struct kvm_host_map *map; 3059 struct page *page; 3060 u64 hpa; 3061 3062 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3063 /* 3064 * Translate L1 physical address to host physical 3065 * address for vmcs02. Keep the page pinned, so this 3066 * physical address remains valid. We keep a reference 3067 * to it so we can release it later. 3068 */ 3069 if (vmx->nested.apic_access_page) { /* shouldn't happen */ 3070 kvm_release_page_clean(vmx->nested.apic_access_page); 3071 vmx->nested.apic_access_page = NULL; 3072 } 3073 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); 3074 if (!is_error_page(page)) { 3075 vmx->nested.apic_access_page = page; 3076 hpa = page_to_phys(vmx->nested.apic_access_page); 3077 vmcs_write64(APIC_ACCESS_ADDR, hpa); 3078 } else { 3079 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", 3080 __func__); 3081 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3082 vcpu->run->internal.suberror = 3083 KVM_INTERNAL_ERROR_EMULATION; 3084 vcpu->run->internal.ndata = 0; 3085 return false; 3086 } 3087 } 3088 3089 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3090 map = &vmx->nested.virtual_apic_map; 3091 3092 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3093 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3094 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3095 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3096 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3097 /* 3098 * The processor will never use the TPR shadow, simply 3099 * clear the bit from the execution control. 
Such a 3100 * configuration is useless, but it happens in tests. 3101 * For any other configuration, failing the vm entry is 3102 * _not_ what the processor does but it's basically the 3103 * only possibility we have. 3104 */ 3105 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3106 } else { 3107 /* 3108 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3109 * force VM-Entry to fail. 3110 */ 3111 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); 3112 } 3113 } 3114 3115 if (nested_cpu_has_posted_intr(vmcs12)) { 3116 map = &vmx->nested.pi_desc_map; 3117 3118 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3119 vmx->nested.pi_desc = 3120 (struct pi_desc *)(((void *)map->hva) + 3121 offset_in_page(vmcs12->posted_intr_desc_addr)); 3122 vmcs_write64(POSTED_INTR_DESC_ADDR, 3123 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3124 } 3125 } 3126 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3127 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3128 else 3129 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3130 return true; 3131 } 3132 3133 /* 3134 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3135 * for running VMX instructions (except VMXON, whose prerequisites are 3136 * slightly different). It also specifies what exception to inject otherwise. 3137 * Note that many of these exceptions have priority over VM exits, so they 3138 * don't have to be checked again here. 3139 */ 3140 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3141 { 3142 if (!to_vmx(vcpu)->nested.vmxon) { 3143 kvm_queue_exception(vcpu, UD_VECTOR); 3144 return 0; 3145 } 3146 3147 if (vmx_get_cpl(vcpu)) { 3148 kvm_inject_gp(vcpu, 0); 3149 return 0; 3150 } 3151 3152 return 1; 3153 } 3154 3155 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3156 { 3157 u8 rvi = vmx_get_rvi(); 3158 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3159 3160 return ((rvi & 0xf0) > (vppr & 0xf0)); 3161 } 3162 3163 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3164 struct vmcs12 *vmcs12); 3165 3166 /* 3167 * If from_vmentry is false, this is being called from state restore (either RSM 3168 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 3169 * 3170 * Returns: 3171 * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode 3172 * NVMX_ENTRY_VMFAIL: Consistency check VMFail 3173 * NVMX_ENTRY_VMEXIT: Consistency check VMExit 3174 * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error 3175 */ 3176 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3177 bool from_vmentry) 3178 { 3179 struct vcpu_vmx *vmx = to_vmx(vcpu); 3180 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3181 bool evaluate_pending_interrupts; 3182 u32 exit_reason = EXIT_REASON_INVALID_STATE; 3183 u32 exit_qual; 3184 3185 evaluate_pending_interrupts = exec_controls_get(vmx) & 3186 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); 3187 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3188 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3189 3190 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3191 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3192 if (kvm_mpx_supported() && 3193 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 3194 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3195 3196 /* 3197 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3198 * nested early checks are disabled. 
In the event of a "late" VM-Fail, 3199 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3200 * software model to the pre-VMEntry host state. When EPT is disabled, 3201 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3202 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3203 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3204 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3205 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3206 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3207 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3208 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3209 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3210 * path would need to manually save/restore vmcs01.GUEST_CR3. 3211 */ 3212 if (!enable_ept && !nested_early_check) 3213 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3214 3215 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3216 3217 prepare_vmcs02_early(vmx, vmcs12); 3218 3219 if (from_vmentry) { 3220 if (unlikely(!nested_get_vmcs12_pages(vcpu))) 3221 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3222 3223 if (nested_vmx_check_vmentry_hw(vcpu)) { 3224 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3225 return NVMX_VMENTRY_VMFAIL; 3226 } 3227 3228 if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) 3229 goto vmentry_fail_vmexit; 3230 } 3231 3232 enter_guest_mode(vcpu); 3233 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 3234 vcpu->arch.tsc_offset += vmcs12->tsc_offset; 3235 3236 if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) 3237 goto vmentry_fail_vmexit_guest_mode; 3238 3239 if (from_vmentry) { 3240 exit_reason = EXIT_REASON_MSR_LOAD_FAIL; 3241 exit_qual = nested_vmx_load_msr(vcpu, 3242 vmcs12->vm_entry_msr_load_addr, 3243 vmcs12->vm_entry_msr_load_count); 3244 if (exit_qual) 3245 goto vmentry_fail_vmexit_guest_mode; 3246 } else { 3247 /* 3248 * The MMU is not initialized to point at the right entities yet and 3249 * "get pages" would need to read data from the guest (i.e. we will 3250 * need to perform gpa to hpa translation). Request a call 3251 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3252 * have already been set at vmentry time and should not be reset. 3253 */ 3254 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); 3255 } 3256 3257 /* 3258 * If L1 had a pending IRQ/NMI until it executed 3259 * VMLAUNCH/VMRESUME which wasn't delivered because it was 3260 * disallowed (e.g. interrupts disabled), L0 needs to 3261 * evaluate if this pending event should cause an exit from L2 3262 * to L1 or delivered directly to L2 (e.g. In case L1 don't 3263 * intercept EXTERNAL_INTERRUPT). 3264 * 3265 * Usually this would be handled by the processor noticing an 3266 * IRQ/NMI window request, or checking RVI during evaluation of 3267 * pending virtual interrupts. However, this setting was done 3268 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 3269 * to perform pending event evaluation by requesting a KVM_REQ_EVENT. 3270 */ 3271 if (unlikely(evaluate_pending_interrupts)) 3272 kvm_make_request(KVM_REQ_EVENT, vcpu); 3273 3274 /* 3275 * Do not start the preemption timer hrtimer until after we know 3276 * we are successful, so that only nested_vmx_vmexit needs to cancel 3277 * the timer. 
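 *
 * (When the timer does fire, vmx_preemption_timer_fn() sets
 * preemption_timer_expired and requests KVM_REQ_EVENT so the expiry is
 * processed on the next entry.)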
3278 */ 3279 vmx->nested.preemption_timer_expired = false; 3280 if (nested_cpu_has_preemption_timer(vmcs12)) 3281 vmx_start_preemption_timer(vcpu); 3282 3283 /* 3284 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3285 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3286 * returned as far as L1 is concerned. It will only return (and set 3287 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3288 */ 3289 return NVMX_VMENTRY_SUCCESS; 3290 3291 /* 3292 * A failed consistency check that leads to a VMExit during L1's 3293 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3294 * 26.7 "VM-entry failures during or after loading guest state". 3295 */ 3296 vmentry_fail_vmexit_guest_mode: 3297 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 3298 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3299 leave_guest_mode(vcpu); 3300 3301 vmentry_fail_vmexit: 3302 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3303 3304 if (!from_vmentry) 3305 return NVMX_VMENTRY_VMEXIT; 3306 3307 load_vmcs12_host_state(vcpu, vmcs12); 3308 vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 3309 vmcs12->exit_qualification = exit_qual; 3310 if (enable_shadow_vmcs || vmx->nested.hv_evmcs) 3311 vmx->nested.need_vmcs12_to_shadow_sync = true; 3312 return NVMX_VMENTRY_VMEXIT; 3313 } 3314 3315 /* 3316 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3317 * for running an L2 nested guest. 3318 */ 3319 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3320 { 3321 struct vmcs12 *vmcs12; 3322 enum nvmx_vmentry_status status; 3323 struct vcpu_vmx *vmx = to_vmx(vcpu); 3324 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3325 3326 if (!nested_vmx_check_permission(vcpu)) 3327 return 1; 3328 3329 if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch)) 3330 return 1; 3331 3332 if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) 3333 return nested_vmx_failInvalid(vcpu); 3334 3335 vmcs12 = get_vmcs12(vcpu); 3336 3337 /* 3338 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3339 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3340 * rather than RFLAGS.ZF, and no error number is stored to the 3341 * VM-instruction error field. 3342 */ 3343 if (vmcs12->hdr.shadow_vmcs) 3344 return nested_vmx_failInvalid(vcpu); 3345 3346 if (vmx->nested.hv_evmcs) { 3347 copy_enlightened_to_vmcs12(vmx); 3348 /* Enlightened VMCS doesn't have launch state */ 3349 vmcs12->launch_state = !launch; 3350 } else if (enable_shadow_vmcs) { 3351 copy_shadow_to_vmcs12(vmx); 3352 } 3353 3354 /* 3355 * The nested entry process starts with enforcing various prerequisites 3356 * on vmcs12 as required by the Intel SDM, and act appropriately when 3357 * they fail: As the SDM explains, some conditions should cause the 3358 * instruction to fail, while others will cause the instruction to seem 3359 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3360 * To speed up the normal (success) code path, we should avoid checking 3361 * for misconfigurations which will anyway be caught by the processor 3362 * when using the merged vmcs02. 3363 */ 3364 if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) 3365 return nested_vmx_failValid(vcpu, 3366 VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3367 3368 if (vmcs12->launch_state == launch) 3369 return nested_vmx_failValid(vcpu, 3370 launch ? 
						   VMXERR_VMLAUNCH_NONCLEAR_VMCS
						 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);

	if (nested_vmx_check_controls(vcpu, vmcs12))
		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	if (nested_vmx_check_host_state(vcpu, vmcs12))
		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	/*
	 * We're finally done with prerequisite checking, and can start with
	 * the nested entry.
	 */
	vmx->nested.nested_run_pending = 1;
	status = nested_vmx_enter_non_root_mode(vcpu, true);
	if (unlikely(status != NVMX_VMENTRY_SUCCESS))
		goto vmentry_failed;

	/* Hide L1D cache contents from the nested guest. */
	vmx->vcpu.arch.l1tf_flush_l1d = true;

	/*
	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
	 * also be used as part of restoring nVMX state for
	 * snapshot restore (migration).
	 *
	 * In this flow, it is assumed that the vmcs12 cache was
	 * transferred as part of the captured nVMX state and should
	 * therefore not be read from guest memory (which may not
	 * exist on the destination host yet).
	 */
	nested_cache_shadow_vmcs12(vcpu, vmcs12);

	/*
	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
	 * awakened by event injection or by an NMI-window VM-exit or
	 * by an interrupt-window VM-exit, halt the vcpu.
	 */
	if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
	    !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
	    !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
	    !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
	      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
		vmx->nested.nested_run_pending = 0;
		return kvm_vcpu_halt(vcpu);
	}
	return 1;

vmentry_failed:
	vmx->nested.nested_run_pending = 0;
	if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
		return 0;
	if (status == NVMX_VMENTRY_VMEXIT)
		return 1;
	WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
	return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
}

/*
 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
 * This function returns the new value we should put in vmcs12.guest_cr0.
 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
 * 1. Bits that neither L0 nor L1 trapped were set directly by L2 and are now
 *    available in vmcs02 GUEST_CR0. (Note: it's enough to check that L0
 *    didn't trap the bit, because if L1 did, so would L0.)
 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
 *    been modified by L2, and L1 knows it. So just leave the old value of
 *    the bit in vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
 *    isn't relevant, because if L0 traps this bit it can set it to anything.
 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
 *    changed these bits, and therefore they need to be updated, but L0
 *    didn't necessarily allow them to be changed in GUEST_CR0 - and rather
 *    put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
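 *
 * A purely illustrative example (the masks are made up, not taken from this
 * code): suppose the only CR0 bit L0 lets the guest own is CR0.TS
 * (vcpu->arch.cr0_guest_owned_bits == X86_CR0_TS) and L1's
 * cr0_guest_host_mask covers only CR0.NE. The merged value is then
 *
 *	(vmcs02 GUEST_CR0       &  X86_CR0_TS) |			// case 1
 *	(vmcs12->guest_cr0      &  X86_CR0_NE) |			// case 2
 *	(vmcs02 CR0_READ_SHADOW & ~(X86_CR0_TS | X86_CR0_NE))		// case 3
 *
 * which is exactly what vmcs12_guest_cr0() below computes.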
3444 */ 3445 static inline unsigned long 3446 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3447 { 3448 return 3449 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3450 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3451 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3452 vcpu->arch.cr0_guest_owned_bits)); 3453 } 3454 3455 static inline unsigned long 3456 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3457 { 3458 return 3459 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3460 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3461 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3462 vcpu->arch.cr4_guest_owned_bits)); 3463 } 3464 3465 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3466 struct vmcs12 *vmcs12) 3467 { 3468 u32 idt_vectoring; 3469 unsigned int nr; 3470 3471 if (vcpu->arch.exception.injected) { 3472 nr = vcpu->arch.exception.nr; 3473 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3474 3475 if (kvm_exception_is_soft(nr)) { 3476 vmcs12->vm_exit_instruction_len = 3477 vcpu->arch.event_exit_inst_len; 3478 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3479 } else 3480 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3481 3482 if (vcpu->arch.exception.has_error_code) { 3483 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3484 vmcs12->idt_vectoring_error_code = 3485 vcpu->arch.exception.error_code; 3486 } 3487 3488 vmcs12->idt_vectoring_info_field = idt_vectoring; 3489 } else if (vcpu->arch.nmi_injected) { 3490 vmcs12->idt_vectoring_info_field = 3491 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3492 } else if (vcpu->arch.interrupt.injected) { 3493 nr = vcpu->arch.interrupt.nr; 3494 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3495 3496 if (vcpu->arch.interrupt.soft) { 3497 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3498 vmcs12->vm_entry_instruction_len = 3499 vcpu->arch.event_exit_inst_len; 3500 } else 3501 idt_vectoring |= INTR_TYPE_EXT_INTR; 3502 3503 vmcs12->idt_vectoring_info_field = idt_vectoring; 3504 } 3505 } 3506 3507 3508 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3509 { 3510 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3511 gfn_t gfn; 3512 3513 /* 3514 * Don't need to mark the APIC access page dirty; it is never 3515 * written to by the CPU during APIC virtualization. 
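	 *
	 * (Guest accesses that hit the APIC-access page are virtualized onto
	 * the virtual-APIC page instead, which is why only the virtual-APIC
	 * page and, for posted interrupts, the PI descriptor are marked dirty
	 * below.)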
3516 */ 3517 3518 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3519 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3520 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3521 } 3522 3523 if (nested_cpu_has_posted_intr(vmcs12)) { 3524 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3525 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3526 } 3527 } 3528 3529 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3530 { 3531 struct vcpu_vmx *vmx = to_vmx(vcpu); 3532 int max_irr; 3533 void *vapic_page; 3534 u16 status; 3535 3536 if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) 3537 return; 3538 3539 vmx->nested.pi_pending = false; 3540 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3541 return; 3542 3543 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3544 if (max_irr != 256) { 3545 vapic_page = vmx->nested.virtual_apic_map.hva; 3546 if (!vapic_page) 3547 return; 3548 3549 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3550 vapic_page, &max_irr); 3551 status = vmcs_read16(GUEST_INTR_STATUS); 3552 if ((u8)max_irr > ((u8)status & 0xff)) { 3553 status &= ~0xff; 3554 status |= (u8)max_irr; 3555 vmcs_write16(GUEST_INTR_STATUS, status); 3556 } 3557 } 3558 3559 nested_mark_vmcs12_pages_dirty(vcpu); 3560 } 3561 3562 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3563 unsigned long exit_qual) 3564 { 3565 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3566 unsigned int nr = vcpu->arch.exception.nr; 3567 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3568 3569 if (vcpu->arch.exception.has_error_code) { 3570 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3571 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3572 } 3573 3574 if (kvm_exception_is_soft(nr)) 3575 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3576 else 3577 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3578 3579 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3580 vmx_get_nmi_mask(vcpu)) 3581 intr_info |= INTR_INFO_UNBLOCK_NMI; 3582 3583 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3584 } 3585 3586 static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) 3587 { 3588 struct vcpu_vmx *vmx = to_vmx(vcpu); 3589 unsigned long exit_qual; 3590 bool block_nested_events = 3591 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3592 struct kvm_lapic *apic = vcpu->arch.apic; 3593 3594 if (lapic_in_kernel(vcpu) && 3595 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3596 if (block_nested_events) 3597 return -EBUSY; 3598 clear_bit(KVM_APIC_INIT, &apic->pending_events); 3599 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3600 return 0; 3601 } 3602 3603 if (vcpu->arch.exception.pending && 3604 nested_vmx_check_exception(vcpu, &exit_qual)) { 3605 if (block_nested_events) 3606 return -EBUSY; 3607 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3608 return 0; 3609 } 3610 3611 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3612 vmx->nested.preemption_timer_expired) { 3613 if (block_nested_events) 3614 return -EBUSY; 3615 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3616 return 0; 3617 } 3618 3619 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { 3620 if (block_nested_events) 3621 return -EBUSY; 3622 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3623 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3624 INTR_INFO_VALID_MASK, 0); 3625 /* 3626 * The NMI-triggered VM exit counts as injection: 3627 * clear this one and block further NMIs. 
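		 *
		 * For reference, the exit interruption-information delivered
		 * to L1 above is
		 * NMI_VECTOR | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK, i.e.
		 * 2 | (2 << 8) | (1u << 31) = 0x80000202, the architectural
		 * encoding of a hardware NMI (vector 2, type NMI, valid bit
		 * set).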
3628 */ 3629 vcpu->arch.nmi_pending = 0; 3630 vmx_set_nmi_mask(vcpu, true); 3631 return 0; 3632 } 3633 3634 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && 3635 nested_exit_on_intr(vcpu)) { 3636 if (block_nested_events) 3637 return -EBUSY; 3638 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3639 return 0; 3640 } 3641 3642 vmx_complete_nested_posted_interrupt(vcpu); 3643 return 0; 3644 } 3645 3646 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3647 { 3648 ktime_t remaining = 3649 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 3650 u64 value; 3651 3652 if (ktime_to_ns(remaining) <= 0) 3653 return 0; 3654 3655 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 3656 do_div(value, 1000000); 3657 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 3658 } 3659 3660 static bool is_vmcs12_ext_field(unsigned long field) 3661 { 3662 switch (field) { 3663 case GUEST_ES_SELECTOR: 3664 case GUEST_CS_SELECTOR: 3665 case GUEST_SS_SELECTOR: 3666 case GUEST_DS_SELECTOR: 3667 case GUEST_FS_SELECTOR: 3668 case GUEST_GS_SELECTOR: 3669 case GUEST_LDTR_SELECTOR: 3670 case GUEST_TR_SELECTOR: 3671 case GUEST_ES_LIMIT: 3672 case GUEST_CS_LIMIT: 3673 case GUEST_SS_LIMIT: 3674 case GUEST_DS_LIMIT: 3675 case GUEST_FS_LIMIT: 3676 case GUEST_GS_LIMIT: 3677 case GUEST_LDTR_LIMIT: 3678 case GUEST_TR_LIMIT: 3679 case GUEST_GDTR_LIMIT: 3680 case GUEST_IDTR_LIMIT: 3681 case GUEST_ES_AR_BYTES: 3682 case GUEST_DS_AR_BYTES: 3683 case GUEST_FS_AR_BYTES: 3684 case GUEST_GS_AR_BYTES: 3685 case GUEST_LDTR_AR_BYTES: 3686 case GUEST_TR_AR_BYTES: 3687 case GUEST_ES_BASE: 3688 case GUEST_CS_BASE: 3689 case GUEST_SS_BASE: 3690 case GUEST_DS_BASE: 3691 case GUEST_FS_BASE: 3692 case GUEST_GS_BASE: 3693 case GUEST_LDTR_BASE: 3694 case GUEST_TR_BASE: 3695 case GUEST_GDTR_BASE: 3696 case GUEST_IDTR_BASE: 3697 case GUEST_PENDING_DBG_EXCEPTIONS: 3698 case GUEST_BNDCFGS: 3699 return true; 3700 default: 3701 break; 3702 } 3703 3704 return false; 3705 } 3706 3707 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3708 struct vmcs12 *vmcs12) 3709 { 3710 struct vcpu_vmx *vmx = to_vmx(vcpu); 3711 3712 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 3713 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 3714 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 3715 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 3716 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 3717 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 3718 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 3719 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 3720 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 3721 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 3722 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 3723 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 3724 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 3725 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 3726 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 3727 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 3728 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 3729 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 3730 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 3731 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 3732 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 3733 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 3734 
vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 3735 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 3736 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 3737 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 3738 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 3739 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 3740 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 3741 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 3742 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 3743 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 3744 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 3745 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 3746 vmcs12->guest_pending_dbg_exceptions = 3747 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 3748 if (kvm_mpx_supported()) 3749 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3750 3751 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 3752 } 3753 3754 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3755 struct vmcs12 *vmcs12) 3756 { 3757 struct vcpu_vmx *vmx = to_vmx(vcpu); 3758 int cpu; 3759 3760 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 3761 return; 3762 3763 3764 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 3765 3766 cpu = get_cpu(); 3767 vmx->loaded_vmcs = &vmx->nested.vmcs02; 3768 vmx_vcpu_load(&vmx->vcpu, cpu); 3769 3770 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 3771 3772 vmx->loaded_vmcs = &vmx->vmcs01; 3773 vmx_vcpu_load(&vmx->vcpu, cpu); 3774 put_cpu(); 3775 } 3776 3777 /* 3778 * Update the guest state fields of vmcs12 to reflect changes that 3779 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 3780 * VM-entry controls is also updated, since this is really a guest 3781 * state bit.) 3782 */ 3783 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3784 { 3785 struct vcpu_vmx *vmx = to_vmx(vcpu); 3786 3787 if (vmx->nested.hv_evmcs) 3788 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 3789 3790 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs; 3791 3792 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 3793 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 3794 3795 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 3796 vmcs12->guest_rip = kvm_rip_read(vcpu); 3797 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 3798 3799 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 3800 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 3801 3802 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); 3803 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); 3804 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); 3805 3806 vmcs12->guest_interruptibility_info = 3807 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 3808 3809 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3810 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 3811 else 3812 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 3813 3814 if (nested_cpu_has_preemption_timer(vmcs12) && 3815 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) 3816 vmcs12->vmx_preemption_timer_value = 3817 vmx_get_preemption_timer_value(vcpu); 3818 3819 /* 3820 * In some cases (usually, nested EPT), L2 is allowed to change its 3821 * own CR3 without exiting. If it has changed it, we must keep it. 3822 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 3823 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 3824 * 3825 * Additionally, restore L2's PDPTR to vmcs12. 
3826 */ 3827 if (enable_ept) { 3828 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 3829 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3830 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 3831 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 3832 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 3833 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 3834 } 3835 } 3836 3837 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 3838 3839 if (nested_cpu_has_vid(vmcs12)) 3840 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 3841 3842 vmcs12->vm_entry_controls = 3843 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 3844 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 3845 3846 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 3847 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 3848 3849 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 3850 vmcs12->guest_ia32_efer = vcpu->arch.efer; 3851 } 3852 3853 /* 3854 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 3855 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 3856 * and this function updates it to reflect the changes to the guest state while 3857 * L2 was running (and perhaps made some exits which were handled directly by L0 3858 * without going back to L1), and to reflect the exit reason. 3859 * Note that we do not have to copy here all VMCS fields, just those that 3860 * could have changed by the L2 guest or the exit - i.e., the guest-state and 3861 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 3862 * which already writes to vmcs12 directly. 3863 */ 3864 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 3865 u32 exit_reason, u32 exit_intr_info, 3866 unsigned long exit_qualification) 3867 { 3868 /* update exit information fields: */ 3869 vmcs12->vm_exit_reason = exit_reason; 3870 vmcs12->exit_qualification = exit_qualification; 3871 vmcs12->vm_exit_intr_info = exit_intr_info; 3872 3873 vmcs12->idt_vectoring_info_field = 0; 3874 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3875 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 3876 3877 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 3878 vmcs12->launch_state = 1; 3879 3880 /* vm_entry_intr_info_field is cleared on exit. Emulate this 3881 * instead of reading the real value. */ 3882 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 3883 3884 /* 3885 * Transfer the event that L0 or L1 may wanted to inject into 3886 * L2 to IDT_VECTORING_INFO_FIELD. 3887 */ 3888 vmcs12_save_pending_event(vcpu, vmcs12); 3889 3890 /* 3891 * According to spec, there's no need to store the guest's 3892 * MSRs if the exit is due to a VM-entry failure that occurs 3893 * during or after loading the guest state. Since this exit 3894 * does not fall in that category, we need to save the MSRs. 3895 */ 3896 if (nested_vmx_store_msr(vcpu, 3897 vmcs12->vm_exit_msr_store_addr, 3898 vmcs12->vm_exit_msr_store_count)) 3899 nested_vmx_abort(vcpu, 3900 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 3901 } 3902 3903 /* 3904 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 3905 * preserved above and would only end up incorrectly in L1. 
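	 *
	 * ("Preserved above" refers to vmcs12_save_pending_event(), which has
	 * already transferred any pending event into
	 * vmcs12->idt_vectoring_info_field.)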
3906 */ 3907 vcpu->arch.nmi_injected = false; 3908 kvm_clear_exception_queue(vcpu); 3909 kvm_clear_interrupt_queue(vcpu); 3910 } 3911 3912 /* 3913 * A part of what we need to when the nested L2 guest exits and we want to 3914 * run its L1 parent, is to reset L1's guest state to the host state specified 3915 * in vmcs12. 3916 * This function is to be called not only on normal nested exit, but also on 3917 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 3918 * Failures During or After Loading Guest State"). 3919 * This function should be called when the active VMCS is L1's (vmcs01). 3920 */ 3921 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3922 struct vmcs12 *vmcs12) 3923 { 3924 struct kvm_segment seg; 3925 u32 entry_failure_code; 3926 3927 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 3928 vcpu->arch.efer = vmcs12->host_ia32_efer; 3929 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 3930 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 3931 else 3932 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 3933 vmx_set_efer(vcpu, vcpu->arch.efer); 3934 3935 kvm_rsp_write(vcpu, vmcs12->host_rsp); 3936 kvm_rip_write(vcpu, vmcs12->host_rip); 3937 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 3938 vmx_set_interrupt_shadow(vcpu, 0); 3939 3940 /* 3941 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 3942 * actually changed, because vmx_set_cr0 refers to efer set above. 3943 * 3944 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 3945 * (KVM doesn't change it); 3946 */ 3947 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 3948 vmx_set_cr0(vcpu, vmcs12->host_cr0); 3949 3950 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 3951 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 3952 vmx_set_cr4(vcpu, vmcs12->host_cr4); 3953 3954 nested_ept_uninit_mmu_context(vcpu); 3955 3956 /* 3957 * Only PDPTE load can fail as the value of cr3 was checked on entry and 3958 * couldn't have changed. 3959 */ 3960 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) 3961 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 3962 3963 if (!enable_ept) 3964 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 3965 3966 /* 3967 * If vmcs01 doesn't use VPID, CPU flushes TLB on every 3968 * VMEntry/VMExit. Thus, no need to flush TLB. 3969 * 3970 * If vmcs12 doesn't use VPID, L1 expects TLB to be 3971 * flushed on every VMEntry/VMExit. 3972 * 3973 * Otherwise, we can preserve TLB entries as long as we are 3974 * able to tag L1 TLB entries differently than L2 TLB entries. 3975 * 3976 * If vmcs12 uses EPT, we need to execute this flush on EPTP01 3977 * and therefore we request the TLB flush to happen only after VMCS EPTP 3978 * has been set by KVM_REQ_LOAD_CR3. 3979 */ 3980 if (enable_vpid && 3981 (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { 3982 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 3983 } 3984 3985 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 3986 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 3987 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 3988 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 3989 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 3990 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 3991 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 3992 3993 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. 
*/ 3994 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 3995 vmcs_write64(GUEST_BNDCFGS, 0); 3996 3997 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 3998 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 3999 vcpu->arch.pat = vmcs12->host_ia32_pat; 4000 } 4001 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 4002 SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4003 vmcs12->host_ia32_perf_global_ctrl); 4004 4005 /* Set L1 segment info according to Intel SDM 4006 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4007 seg = (struct kvm_segment) { 4008 .base = 0, 4009 .limit = 0xFFFFFFFF, 4010 .selector = vmcs12->host_cs_selector, 4011 .type = 11, 4012 .present = 1, 4013 .s = 1, 4014 .g = 1 4015 }; 4016 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4017 seg.l = 1; 4018 else 4019 seg.db = 1; 4020 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4021 seg = (struct kvm_segment) { 4022 .base = 0, 4023 .limit = 0xFFFFFFFF, 4024 .type = 3, 4025 .present = 1, 4026 .s = 1, 4027 .db = 1, 4028 .g = 1 4029 }; 4030 seg.selector = vmcs12->host_ds_selector; 4031 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4032 seg.selector = vmcs12->host_es_selector; 4033 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4034 seg.selector = vmcs12->host_ss_selector; 4035 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4036 seg.selector = vmcs12->host_fs_selector; 4037 seg.base = vmcs12->host_fs_base; 4038 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4039 seg.selector = vmcs12->host_gs_selector; 4040 seg.base = vmcs12->host_gs_base; 4041 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4042 seg = (struct kvm_segment) { 4043 .base = vmcs12->host_tr_base, 4044 .limit = 0x67, 4045 .selector = vmcs12->host_tr_selector, 4046 .type = 11, 4047 .present = 1 4048 }; 4049 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4050 4051 kvm_set_dr(vcpu, 7, 0x400); 4052 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4053 4054 if (cpu_has_vmx_msr_bitmap()) 4055 vmx_update_msr_bitmap(vcpu); 4056 4057 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4058 vmcs12->vm_exit_msr_load_count)) 4059 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4060 } 4061 4062 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4063 { 4064 struct shared_msr_entry *efer_msr; 4065 unsigned int i; 4066 4067 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4068 return vmcs_read64(GUEST_IA32_EFER); 4069 4070 if (cpu_has_load_ia32_efer()) 4071 return host_efer; 4072 4073 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4074 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4075 return vmx->msr_autoload.guest.val[i].value; 4076 } 4077 4078 efer_msr = find_msr_entry(vmx, MSR_EFER); 4079 if (efer_msr) 4080 return efer_msr->data; 4081 4082 return host_efer; 4083 } 4084 4085 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4086 { 4087 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4088 struct vcpu_vmx *vmx = to_vmx(vcpu); 4089 struct vmx_msr_entry g, h; 4090 gpa_t gpa; 4091 u32 i, j; 4092 4093 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4094 4095 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4096 /* 4097 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4098 * as vmcs01.GUEST_DR7 contains a userspace defined value 4099 * and vcpu->arch.dr7 is not squirreled away before the 4100 * nested VMENTER (not worth adding a variable in nested_vmx). 
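		 *
		 * (DR7_FIXED_1 is 0x400, i.e. all breakpoints disabled,
		 * matching the value the CPU itself loads into DR7 on a
		 * VM-exit.)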
4101 */ 4102 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4103 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4104 else 4105 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4106 } 4107 4108 /* 4109 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4110 * handle a variety of side effects to KVM's software model. 4111 */ 4112 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4113 4114 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 4115 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4116 4117 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4118 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4119 4120 nested_ept_uninit_mmu_context(vcpu); 4121 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4122 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4123 4124 /* 4125 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4126 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4127 * VMFail, like everything else we just need to ensure our 4128 * software model is up-to-date. 4129 */ 4130 if (enable_ept) 4131 ept_save_pdptrs(vcpu); 4132 4133 kvm_mmu_reset_context(vcpu); 4134 4135 if (cpu_has_vmx_msr_bitmap()) 4136 vmx_update_msr_bitmap(vcpu); 4137 4138 /* 4139 * This nasty bit of open coding is a compromise between blindly 4140 * loading L1's MSRs using the exit load lists (incorrect emulation 4141 * of VMFail), leaving the nested VM's MSRs in the software model 4142 * (incorrect behavior) and snapshotting the modified MSRs (too 4143 * expensive since the lists are unbound by hardware). For each 4144 * MSR that was (prematurely) loaded from the nested VMEntry load 4145 * list, reload it from the exit load list if it exists and differs 4146 * from the guest value. The intent is to stuff host state as 4147 * silently as possible, not to fully process the exit load list. 4148 */ 4149 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4150 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4151 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4152 pr_debug_ratelimited( 4153 "%s read MSR index failed (%u, 0x%08llx)\n", 4154 __func__, i, gpa); 4155 goto vmabort; 4156 } 4157 4158 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4159 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4160 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4161 pr_debug_ratelimited( 4162 "%s read MSR failed (%u, 0x%08llx)\n", 4163 __func__, j, gpa); 4164 goto vmabort; 4165 } 4166 if (h.index != g.index) 4167 continue; 4168 if (h.value == g.value) 4169 break; 4170 4171 if (nested_vmx_load_msr_check(vcpu, &h)) { 4172 pr_debug_ratelimited( 4173 "%s check failed (%u, 0x%x, 0x%x)\n", 4174 __func__, j, h.index, h.reserved); 4175 goto vmabort; 4176 } 4177 4178 if (kvm_set_msr(vcpu, h.index, h.value)) { 4179 pr_debug_ratelimited( 4180 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4181 __func__, j, h.index, h.value); 4182 goto vmabort; 4183 } 4184 } 4185 } 4186 4187 return; 4188 4189 vmabort: 4190 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4191 } 4192 4193 /* 4194 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4195 * and modify vmcs12 to make it see what it would expect to see there if 4196 * L2 was its real guest. 
Must only be called when in L2 (is_guest_mode()) 4197 */ 4198 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, 4199 u32 exit_intr_info, unsigned long exit_qualification) 4200 { 4201 struct vcpu_vmx *vmx = to_vmx(vcpu); 4202 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4203 4204 /* trying to cancel vmlaunch/vmresume is a bug */ 4205 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4206 4207 leave_guest_mode(vcpu); 4208 4209 if (nested_cpu_has_preemption_timer(vmcs12)) 4210 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4211 4212 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 4213 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 4214 4215 if (likely(!vmx->fail)) { 4216 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4217 4218 if (exit_reason != -1) 4219 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 4220 exit_qualification); 4221 4222 /* 4223 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4224 * also be used to capture vmcs12 cache as part of 4225 * capturing nVMX state for snapshot (migration). 4226 * 4227 * Otherwise, this flush will dirty guest memory at a 4228 * point it is already assumed by user-space to be 4229 * immutable. 4230 */ 4231 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4232 } else { 4233 /* 4234 * The only expected VM-instruction error is "VM entry with 4235 * invalid control field(s)." Anything else indicates a 4236 * problem with L0. And we should never get here with a 4237 * VMFail of any type if early consistency checks are enabled. 4238 */ 4239 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4240 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4241 WARN_ON_ONCE(nested_early_check); 4242 } 4243 4244 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4245 4246 /* Update any VMCS fields that might have changed while L2 ran */ 4247 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4248 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4249 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4250 if (vmx->nested.l1_tpr_threshold != -1) 4251 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4252 4253 if (kvm_has_tsc_control) 4254 decache_tsc_multiplier(vmx); 4255 4256 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4257 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4258 vmx_set_virtual_apic_mode(vcpu); 4259 } 4260 4261 /* Unpin physical memory we referred to in vmcs02 */ 4262 if (vmx->nested.apic_access_page) { 4263 kvm_release_page_clean(vmx->nested.apic_access_page); 4264 vmx->nested.apic_access_page = NULL; 4265 } 4266 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4267 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4268 vmx->nested.pi_desc = NULL; 4269 4270 /* 4271 * We are now running in L2, mmu_notifier will force to reload the 4272 * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. 4273 */ 4274 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4275 4276 if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) 4277 vmx->nested.need_vmcs12_to_shadow_sync = true; 4278 4279 /* in case we halted in L2 */ 4280 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4281 4282 if (likely(!vmx->fail)) { 4283 /* 4284 * TODO: SDM says that with acknowledge interrupt on 4285 * exit, bit 31 of the VM-exit interrupt information 4286 * (valid interrupt) is always set to 1 on 4287 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't 4288 * need kvm_cpu_has_interrupt(). See the commit 4289 * message for details. 
4290 */ 4291 if (nested_exit_intr_ack_set(vcpu) && 4292 exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4293 kvm_cpu_has_interrupt(vcpu)) { 4294 int irq = kvm_cpu_get_interrupt(vcpu); 4295 WARN_ON(irq < 0); 4296 vmcs12->vm_exit_intr_info = irq | 4297 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4298 } 4299 4300 if (exit_reason != -1) 4301 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4302 vmcs12->exit_qualification, 4303 vmcs12->idt_vectoring_info_field, 4304 vmcs12->vm_exit_intr_info, 4305 vmcs12->vm_exit_intr_error_code, 4306 KVM_ISA_VMX); 4307 4308 load_vmcs12_host_state(vcpu, vmcs12); 4309 4310 return; 4311 } 4312 4313 /* 4314 * After an early L2 VM-entry failure, we're now back 4315 * in L1 which thinks it just finished a VMLAUNCH or 4316 * VMRESUME instruction, so we need to set the failure 4317 * flag and the VM-instruction error field of the VMCS 4318 * accordingly, and skip the emulated instruction. 4319 */ 4320 (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4321 4322 /* 4323 * Restore L1's host state to KVM's software model. We're here 4324 * because a consistency check was caught by hardware, which 4325 * means some amount of guest state has been propagated to KVM's 4326 * model and needs to be unwound to the host's state. 4327 */ 4328 nested_vmx_restore_host_state(vcpu); 4329 4330 vmx->fail = 0; 4331 } 4332 4333 /* 4334 * Decode the memory-address operand of a vmx instruction, as recorded on an 4335 * exit caused by such an instruction (run by a guest hypervisor). 4336 * On success, returns 0. When the operand is invalid, returns 1 and throws 4337 * #UD or #GP. 4338 */ 4339 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4340 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4341 { 4342 gva_t off; 4343 bool exn; 4344 struct kvm_segment s; 4345 4346 /* 4347 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4348 * Execution", on an exit, vmx_instruction_info holds most of the 4349 * addressing components of the operand. Only the displacement part 4350 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4351 * For how an actual address is calculated from all these components, 4352 * refer to Vol. 1, "Operand Addressing". 4353 */ 4354 int scaling = vmx_instruction_info & 3; 4355 int addr_size = (vmx_instruction_info >> 7) & 7; 4356 bool is_reg = vmx_instruction_info & (1u << 10); 4357 int seg_reg = (vmx_instruction_info >> 15) & 7; 4358 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4359 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4360 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4361 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4362 4363 if (is_reg) { 4364 kvm_queue_exception(vcpu, UD_VECTOR); 4365 return 1; 4366 } 4367 4368 /* Addr = segment_base + offset */ 4369 /* offset = base + [index * scale] + displacement */ 4370 off = exit_qualification; /* holds the displacement */ 4371 if (addr_size == 1) 4372 off = (gva_t)sign_extend64(off, 31); 4373 else if (addr_size == 0) 4374 off = (gva_t)sign_extend64(off, 15); 4375 if (base_is_valid) 4376 off += kvm_register_read(vcpu, base_reg); 4377 if (index_is_valid) 4378 off += kvm_register_read(vcpu, index_reg)<<scaling; 4379 vmx_get_segment(vcpu, &s, seg_reg); 4380 4381 /* 4382 * The effective address, i.e. @off, of a memory operand is truncated 4383 * based on the address size of the instruction. Note that this is 4384 * the *effective address*, i.e. 
the address prior to accounting for 4385 * the segment's base. 4386 */ 4387 if (addr_size == 1) /* 32 bit */ 4388 off &= 0xffffffff; 4389 else if (addr_size == 0) /* 16 bit */ 4390 off &= 0xffff; 4391 4392 /* Checks for #GP/#SS exceptions. */ 4393 exn = false; 4394 if (is_long_mode(vcpu)) { 4395 /* 4396 * The virtual/linear address is never truncated in 64-bit 4397 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4398 * address when using FS/GS with a non-zero base. 4399 */ 4400 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4401 *ret = s.base + off; 4402 else 4403 *ret = off; 4404 4405 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4406 * non-canonical form. This is the only check on the memory 4407 * destination for long mode! 4408 */ 4409 exn = is_noncanonical_address(*ret, vcpu); 4410 } else { 4411 /* 4412 * When not in long mode, the virtual/linear address is 4413 * unconditionally truncated to 32 bits regardless of the 4414 * address size. 4415 */ 4416 *ret = (s.base + off) & 0xffffffff; 4417 4418 /* Protected mode: apply checks for segment validity in the 4419 * following order: 4420 * - segment type check (#GP(0) may be thrown) 4421 * - usability check (#GP(0)/#SS(0)) 4422 * - limit check (#GP(0)/#SS(0)) 4423 */ 4424 if (wr) 4425 /* #GP(0) if the destination operand is located in a 4426 * read-only data segment or any code segment. 4427 */ 4428 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4429 else 4430 /* #GP(0) if the source operand is located in an 4431 * execute-only code segment 4432 */ 4433 exn = ((s.type & 0xa) == 8); 4434 if (exn) { 4435 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4436 return 1; 4437 } 4438 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4439 */ 4440 exn = (s.unusable != 0); 4441 4442 /* 4443 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4444 * outside the segment limit. All CPUs that support VMX ignore 4445 * limit checks for flat segments, i.e. segments with base==0, 4446 * limit==0xffffffff and of type expand-up data or code. 4447 */ 4448 if (!(s.base == 0 && s.limit == 0xffffffff && 4449 ((s.type & 8) || !(s.type & 4)))) 4450 exn = exn || ((u64)off + len - 1 > s.limit); 4451 } 4452 if (exn) { 4453 kvm_queue_exception_e(vcpu, 4454 seg_reg == VCPU_SREG_SS ? 
				      SS_VECTOR : GP_VECTOR,
				      0);
		return 1;
	}

	return 0;
}

void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx;

	if (!nested_vmx_allowed(vcpu))
		return;

	vmx = to_vmx(vcpu);
	if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
		vmx->nested.msrs.entry_ctls_high |=
				VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
		vmx->nested.msrs.exit_ctls_high |=
				VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
	} else {
		vmx->nested.msrs.entry_ctls_high &=
				~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
		vmx->nested.msrs.exit_ctls_high &=
				~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
	}
}

static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
{
	gva_t gva;
	struct x86_exception e;

	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
				vmcs_read32(VMX_INSTRUCTION_INFO), false,
				sizeof(*vmpointer), &gva))
		return 1;

	if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
		kvm_inject_page_fault(vcpu, &e);
		return 1;
	}

	return 0;
}

/*
 * Allocate a shadow VMCS and associate it with the currently loaded
 * VMCS, unless such a shadow VMCS already exists. The newly allocated
 * VMCS is also VMCLEARed, so that it is ready for use.
 */
static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;

	/*
	 * We should allocate a shadow vmcs for vmcs01 only when L1
	 * executes VMXON and free it when L1 executes VMXOFF.
	 * As it is invalid to execute VMXON twice, we shouldn't reach
	 * here when vmcs01 already has an allocated shadow vmcs.
	 */
	WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);

	if (!loaded_vmcs->shadow_vmcs) {
		loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
		if (loaded_vmcs->shadow_vmcs)
			vmcs_clear(loaded_vmcs->shadow_vmcs);
	}
	return loaded_vmcs->shadow_vmcs;
}

static int enter_vmx_operation(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int r;

	r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
	if (r < 0)
		goto out_vmcs02;

	vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
	if (!vmx->nested.cached_vmcs12)
		goto out_cached_vmcs12;

	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
	if (!vmx->nested.cached_shadow_vmcs12)
		goto out_cached_shadow_vmcs12;

	if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
		goto out_shadow_vmcs;

	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL_PINNED);
	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;

	vmx->nested.vpid02 = allocate_vpid();

	vmx->nested.vmcs02_initialized = false;
	vmx->nested.vmxon = true;

	if (pt_mode == PT_MODE_HOST_GUEST) {
		vmx->pt_desc.guest.ctl = 0;
		pt_update_intercept_for_msr(vmx);
	}

	return 0;

out_shadow_vmcs:
	kfree(vmx->nested.cached_shadow_vmcs12);

out_cached_shadow_vmcs12:
	kfree(vmx->nested.cached_vmcs12);

out_cached_vmcs12:
	free_loaded_vmcs(&vmx->nested.vmcs02);

out_vmcs02:
	return -ENOMEM;
}

/*
 * Emulate the VMXON instruction.
4579 * Currently, we just remember that VMX is active, and do not save or even 4580 * inspect the argument to VMXON (the so-called "VMXON pointer") because we 4581 * do not currently need to store anything in that guest-allocated memory 4582 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their 4583 * argument is different from the VMXON pointer (which the spec says they do). 4584 */ 4585 static int handle_vmon(struct kvm_vcpu *vcpu) 4586 { 4587 int ret; 4588 gpa_t vmptr; 4589 uint32_t revision; 4590 struct vcpu_vmx *vmx = to_vmx(vcpu); 4591 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED 4592 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 4593 4594 /* 4595 * The Intel VMX Instruction Reference lists a bunch of bits that are 4596 * prerequisite to running VMXON, most notably cr4.VMXE must be set to 4597 * 1 (see vmx_set_cr4() for when we allow the guest to set this). 4598 * Otherwise, we should fail with #UD. But most faulting conditions 4599 * have already been checked by hardware, prior to the VM-exit for 4600 * VMXON. We do test guest cr4.VMXE because processor CR4 always has 4601 * that bit set to 1 in non-root mode. 4602 */ 4603 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { 4604 kvm_queue_exception(vcpu, UD_VECTOR); 4605 return 1; 4606 } 4607 4608 /* CPL=0 must be checked manually. */ 4609 if (vmx_get_cpl(vcpu)) { 4610 kvm_inject_gp(vcpu, 0); 4611 return 1; 4612 } 4613 4614 if (vmx->nested.vmxon) 4615 return nested_vmx_failValid(vcpu, 4616 VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 4617 4618 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 4619 != VMXON_NEEDED_FEATURES) { 4620 kvm_inject_gp(vcpu, 0); 4621 return 1; 4622 } 4623 4624 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4625 return 1; 4626 4627 /* 4628 * SDM 3: 24.11.5 4629 * The first 4 bytes of VMXON region contain the supported 4630 * VMCS revision identifier 4631 * 4632 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 4633 * which replaces physical address width with 32 4634 */ 4635 if (!page_address_valid(vcpu, vmptr)) 4636 return nested_vmx_failInvalid(vcpu); 4637 4638 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 4639 revision != VMCS12_REVISION) 4640 return nested_vmx_failInvalid(vcpu); 4641 4642 vmx->nested.vmxon_ptr = vmptr; 4643 ret = enter_vmx_operation(vcpu); 4644 if (ret) 4645 return ret; 4646 4647 return nested_vmx_succeed(vcpu); 4648 } 4649 4650 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 4651 { 4652 struct vcpu_vmx *vmx = to_vmx(vcpu); 4653 4654 if (vmx->nested.current_vmptr == -1ull) 4655 return; 4656 4657 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 4658 4659 if (enable_shadow_vmcs) { 4660 /* copy to memory all shadowed fields in case 4661 they were modified */ 4662 copy_shadow_to_vmcs12(vmx); 4663 vmx_disable_shadow_vmcs(vmx); 4664 } 4665 vmx->nested.posted_intr_nv = -1; 4666 4667 /* Flush VMCS12 to guest memory */ 4668 kvm_vcpu_write_guest_page(vcpu, 4669 vmx->nested.current_vmptr >> PAGE_SHIFT, 4670 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 4671 4672 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 4673 4674 vmx->nested.current_vmptr = -1ull; 4675 } 4676 4677 /* Emulate the VMXOFF instruction */ 4678 static int handle_vmoff(struct kvm_vcpu *vcpu) 4679 { 4680 if (!nested_vmx_check_permission(vcpu)) 4681 return 1; 4682 4683 free_nested(vcpu); 4684 4685 /* Process a latched INIT during time CPU was in VMX operation */ 4686 kvm_make_request(KVM_REQ_EVENT, vcpu); 4687 4688 return 
nested_vmx_succeed(vcpu); 4689 } 4690 4691 /* Emulate the VMCLEAR instruction */ 4692 static int handle_vmclear(struct kvm_vcpu *vcpu) 4693 { 4694 struct vcpu_vmx *vmx = to_vmx(vcpu); 4695 u32 zero = 0; 4696 gpa_t vmptr; 4697 u64 evmcs_gpa; 4698 4699 if (!nested_vmx_check_permission(vcpu)) 4700 return 1; 4701 4702 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4703 return 1; 4704 4705 if (!page_address_valid(vcpu, vmptr)) 4706 return nested_vmx_failValid(vcpu, 4707 VMXERR_VMCLEAR_INVALID_ADDRESS); 4708 4709 if (vmptr == vmx->nested.vmxon_ptr) 4710 return nested_vmx_failValid(vcpu, 4711 VMXERR_VMCLEAR_VMXON_POINTER); 4712 4713 /* 4714 * When Enlightened VMEntry is enabled on the calling CPU we treat 4715 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 4716 * way to distinguish it from VMCS12) and we must not corrupt it by 4717 * writing to the non-existent 'launch_state' field. The area doesn't 4718 * have to be the currently active EVMCS on the calling CPU and there's 4719 * nothing KVM has to do to transition it from 'active' to 'non-active' 4720 * state. It is possible that the area will stay mapped as 4721 * vmx->nested.hv_evmcs but this shouldn't be a problem. 4722 */ 4723 if (likely(!vmx->nested.enlightened_vmcs_enabled || 4724 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 4725 if (vmptr == vmx->nested.current_vmptr) 4726 nested_release_vmcs12(vcpu); 4727 4728 kvm_vcpu_write_guest(vcpu, 4729 vmptr + offsetof(struct vmcs12, 4730 launch_state), 4731 &zero, sizeof(zero)); 4732 } 4733 4734 return nested_vmx_succeed(vcpu); 4735 } 4736 4737 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); 4738 4739 /* Emulate the VMLAUNCH instruction */ 4740 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 4741 { 4742 return nested_vmx_run(vcpu, true); 4743 } 4744 4745 /* Emulate the VMRESUME instruction */ 4746 static int handle_vmresume(struct kvm_vcpu *vcpu) 4747 { 4748 4749 return nested_vmx_run(vcpu, false); 4750 } 4751 4752 static int handle_vmread(struct kvm_vcpu *vcpu) 4753 { 4754 unsigned long field; 4755 u64 field_value; 4756 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4757 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4758 int len; 4759 gva_t gva = 0; 4760 struct vmcs12 *vmcs12; 4761 struct x86_exception e; 4762 short offset; 4763 4764 if (!nested_vmx_check_permission(vcpu)) 4765 return 1; 4766 4767 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) 4768 return nested_vmx_failInvalid(vcpu); 4769 4770 if (!is_guest_mode(vcpu)) 4771 vmcs12 = get_vmcs12(vcpu); 4772 else { 4773 /* 4774 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD 4775 * to shadowed-field sets the ALU flags for VMfailInvalid. 4776 */ 4777 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) 4778 return nested_vmx_failInvalid(vcpu); 4779 vmcs12 = get_shadow_vmcs12(vcpu); 4780 } 4781 4782 /* Decode instruction info and find the field to read */ 4783 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 4784 4785 offset = vmcs_field_to_offset(field); 4786 if (offset < 0) 4787 return nested_vmx_failValid(vcpu, 4788 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4789 4790 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 4791 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4792 4793 /* Read the field, zero-extended to a u64 field_value */ 4794 field_value = vmcs12_read_any(vmcs12, field, offset); 4795 4796 /* 4797 * Now copy part of this value to register or memory, as requested. 
4798 * Note that the number of bits actually copied is 32 or 64 depending 4799 * on the guest's mode (32 or 64 bit), not on the given field's length. 4800 */ 4801 if (vmx_instruction_info & (1u << 10)) { 4802 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), 4803 field_value); 4804 } else { 4805 len = is_64_bit_mode(vcpu) ? 8 : 4; 4806 if (get_vmx_mem_address(vcpu, exit_qualification, 4807 vmx_instruction_info, true, len, &gva)) 4808 return 1; 4809 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 4810 if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e)) 4811 kvm_inject_page_fault(vcpu, &e); 4812 } 4813 4814 return nested_vmx_succeed(vcpu); 4815 } 4816 4817 static bool is_shadow_field_rw(unsigned long field) 4818 { 4819 switch (field) { 4820 #define SHADOW_FIELD_RW(x, y) case x: 4821 #include "vmcs_shadow_fields.h" 4822 return true; 4823 default: 4824 break; 4825 } 4826 return false; 4827 } 4828 4829 static bool is_shadow_field_ro(unsigned long field) 4830 { 4831 switch (field) { 4832 #define SHADOW_FIELD_RO(x, y) case x: 4833 #include "vmcs_shadow_fields.h" 4834 return true; 4835 default: 4836 break; 4837 } 4838 return false; 4839 } 4840 4841 static int handle_vmwrite(struct kvm_vcpu *vcpu) 4842 { 4843 unsigned long field; 4844 int len; 4845 gva_t gva; 4846 struct vcpu_vmx *vmx = to_vmx(vcpu); 4847 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4848 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4849 4850 /* The value to write might be 32 or 64 bits, depending on L1's long 4851 * mode, and eventually we need to write that into a field of several 4852 * possible lengths. The code below first zero-extends the value to 64 4853 * bit (field_value), and then copies only the appropriate number of 4854 * bits into the vmcs12 field. 4855 */ 4856 u64 field_value = 0; 4857 struct x86_exception e; 4858 struct vmcs12 *vmcs12; 4859 short offset; 4860 4861 if (!nested_vmx_check_permission(vcpu)) 4862 return 1; 4863 4864 if (vmx->nested.current_vmptr == -1ull) 4865 return nested_vmx_failInvalid(vcpu); 4866 4867 if (vmx_instruction_info & (1u << 10)) 4868 field_value = kvm_register_readl(vcpu, 4869 (((vmx_instruction_info) >> 3) & 0xf)); 4870 else { 4871 len = is_64_bit_mode(vcpu) ? 8 : 4; 4872 if (get_vmx_mem_address(vcpu, exit_qualification, 4873 vmx_instruction_info, false, len, &gva)) 4874 return 1; 4875 if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) { 4876 kvm_inject_page_fault(vcpu, &e); 4877 return 1; 4878 } 4879 } 4880 4881 4882 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 4883 /* 4884 * If the vCPU supports "VMWRITE to any supported field in the 4885 * VMCS," then the "read-only" fields are actually read/write. 4886 */ 4887 if (vmcs_field_readonly(field) && 4888 !nested_cpu_has_vmwrite_any_field(vcpu)) 4889 return nested_vmx_failValid(vcpu, 4890 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 4891 4892 if (!is_guest_mode(vcpu)) { 4893 vmcs12 = get_vmcs12(vcpu); 4894 4895 /* 4896 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 4897 * vmcs12, else we may crush a field or consume a stale value. 4898 */ 4899 if (!is_shadow_field_rw(field)) 4900 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4901 } else { 4902 /* 4903 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE 4904 * to shadowed-field sets the ALU flags for VMfailInvalid. 
4905 */ 4906 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) 4907 return nested_vmx_failInvalid(vcpu); 4908 vmcs12 = get_shadow_vmcs12(vcpu); 4909 } 4910 4911 offset = vmcs_field_to_offset(field); 4912 if (offset < 0) 4913 return nested_vmx_failValid(vcpu, 4914 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4915 4916 /* 4917 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 4918 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 4919 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 4920 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 4921 * from L1 will return a different value than VMREAD from L2 (L1 sees 4922 * the stripped down value, L2 sees the full value as stored by KVM). 4923 */ 4924 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 4925 field_value &= 0x1f0ff; 4926 4927 vmcs12_write_any(vmcs12, field, offset, field_value); 4928 4929 /* 4930 * Do not track vmcs12 dirty-state if in guest-mode as we actually 4931 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 4932 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 4933 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 4934 */ 4935 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 4936 /* 4937 * L1 can read these fields without exiting, ensure the 4938 * shadow VMCS is up-to-date. 4939 */ 4940 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 4941 preempt_disable(); 4942 vmcs_load(vmx->vmcs01.shadow_vmcs); 4943 4944 __vmcs_writel(field, field_value); 4945 4946 vmcs_clear(vmx->vmcs01.shadow_vmcs); 4947 vmcs_load(vmx->loaded_vmcs->vmcs); 4948 preempt_enable(); 4949 } 4950 vmx->nested.dirty_vmcs12 = true; 4951 } 4952 4953 return nested_vmx_succeed(vcpu); 4954 } 4955 4956 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 4957 { 4958 vmx->nested.current_vmptr = vmptr; 4959 if (enable_shadow_vmcs) { 4960 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 4961 vmcs_write64(VMCS_LINK_POINTER, 4962 __pa(vmx->vmcs01.shadow_vmcs)); 4963 vmx->nested.need_vmcs12_to_shadow_sync = true; 4964 } 4965 vmx->nested.dirty_vmcs12 = true; 4966 } 4967 4968 /* Emulate the VMPTRLD instruction */ 4969 static int handle_vmptrld(struct kvm_vcpu *vcpu) 4970 { 4971 struct vcpu_vmx *vmx = to_vmx(vcpu); 4972 gpa_t vmptr; 4973 4974 if (!nested_vmx_check_permission(vcpu)) 4975 return 1; 4976 4977 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4978 return 1; 4979 4980 if (!page_address_valid(vcpu, vmptr)) 4981 return nested_vmx_failValid(vcpu, 4982 VMXERR_VMPTRLD_INVALID_ADDRESS); 4983 4984 if (vmptr == vmx->nested.vmxon_ptr) 4985 return nested_vmx_failValid(vcpu, 4986 VMXERR_VMPTRLD_VMXON_POINTER); 4987 4988 /* Forbid normal VMPTRLD if Enlightened version was used */ 4989 if (vmx->nested.hv_evmcs) 4990 return 1; 4991 4992 if (vmx->nested.current_vmptr != vmptr) { 4993 struct kvm_host_map map; 4994 struct vmcs12 *new_vmcs12; 4995 4996 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { 4997 /* 4998 * Reads from an unbacked page return all 1s, 4999 * which means that the 32 bits located at the 5000 * given physical address won't match the required 5001 * VMCS12_REVISION identifier. 
5002 */ 5003 return nested_vmx_failValid(vcpu, 5004 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5005 } 5006 5007 new_vmcs12 = map.hva; 5008 5009 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || 5010 (new_vmcs12->hdr.shadow_vmcs && 5011 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5012 kvm_vcpu_unmap(vcpu, &map, false); 5013 return nested_vmx_failValid(vcpu, 5014 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5015 } 5016 5017 nested_release_vmcs12(vcpu); 5018 5019 /* 5020 * Load VMCS12 from guest memory since it is not already 5021 * cached. 5022 */ 5023 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); 5024 kvm_vcpu_unmap(vcpu, &map, false); 5025 5026 set_current_vmptr(vmx, vmptr); 5027 } 5028 5029 return nested_vmx_succeed(vcpu); 5030 } 5031 5032 /* Emulate the VMPTRST instruction */ 5033 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5034 { 5035 unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); 5036 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5037 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5038 struct x86_exception e; 5039 gva_t gva; 5040 5041 if (!nested_vmx_check_permission(vcpu)) 5042 return 1; 5043 5044 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) 5045 return 1; 5046 5047 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5048 true, sizeof(gpa_t), &gva)) 5049 return 1; 5050 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5051 if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5052 sizeof(gpa_t), &e)) { 5053 kvm_inject_page_fault(vcpu, &e); 5054 return 1; 5055 } 5056 return nested_vmx_succeed(vcpu); 5057 } 5058 5059 /* Emulate the INVEPT instruction */ 5060 static int handle_invept(struct kvm_vcpu *vcpu) 5061 { 5062 struct vcpu_vmx *vmx = to_vmx(vcpu); 5063 u32 vmx_instruction_info, types; 5064 unsigned long type; 5065 gva_t gva; 5066 struct x86_exception e; 5067 struct { 5068 u64 eptp, gpa; 5069 } operand; 5070 5071 if (!(vmx->nested.msrs.secondary_ctls_high & 5072 SECONDARY_EXEC_ENABLE_EPT) || 5073 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5074 kvm_queue_exception(vcpu, UD_VECTOR); 5075 return 1; 5076 } 5077 5078 if (!nested_vmx_check_permission(vcpu)) 5079 return 1; 5080 5081 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5082 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 5083 5084 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5085 5086 if (type >= 32 || !(types & (1 << type))) 5087 return nested_vmx_failValid(vcpu, 5088 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5089 5090 /* According to the Intel VMX instruction reference, the memory 5091 * operand is read even if it isn't needed (e.g., for type==global) 5092 */ 5093 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 5094 vmx_instruction_info, false, sizeof(operand), &gva)) 5095 return 1; 5096 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { 5097 kvm_inject_page_fault(vcpu, &e); 5098 return 1; 5099 } 5100 5101 switch (type) { 5102 case VMX_EPT_EXTENT_GLOBAL: 5103 case VMX_EPT_EXTENT_CONTEXT: 5104 /* 5105 * TODO: Sync the necessary shadow EPT roots here, rather than 5106 * at the next emulated VM-entry. 

	if (type >= 32 || !(types & (1 << type)))
		return nested_vmx_failValid(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	/* According to the Intel VMX instruction reference, the memory
	 * operand is read even if it isn't needed (e.g., for type==global)
	 */
	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
			vmx_instruction_info, false, sizeof(operand), &gva))
		return 1;
	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
		kvm_inject_page_fault(vcpu, &e);
		return 1;
	}

	switch (type) {
	case VMX_EPT_EXTENT_GLOBAL:
	case VMX_EPT_EXTENT_CONTEXT:
		/*
		 * TODO: Sync the necessary shadow EPT roots here, rather than
		 * at the next emulated VM-entry.
		 */
		break;
	default:
		BUG_ON(1);
		break;
	}

	return nested_vmx_succeed(vcpu);
}

static int handle_invvpid(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmx_instruction_info;
	unsigned long type, types;
	gva_t gva;
	struct x86_exception e;
	struct {
		u64 vpid;
		u64 gla;
	} operand;
	u16 vpid02;

	if (!(vmx->nested.msrs.secondary_ctls_high &
	      SECONDARY_EXEC_ENABLE_VPID) ||
	    !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);

	types = (vmx->nested.msrs.vpid_caps &
			VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;

	if (type >= 32 || !(types & (1 << type)))
		return nested_vmx_failValid(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	/* According to the Intel VMX instruction reference, the memory
	 * operand is read even if it isn't needed (e.g., for type==global)
	 */
	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
			vmx_instruction_info, false, sizeof(operand), &gva))
		return 1;
	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
		kvm_inject_page_fault(vcpu, &e);
		return 1;
	}
	if (operand.vpid >> 16)
		return nested_vmx_failValid(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	vpid02 = nested_get_vpid02(vcpu);
	switch (type) {
	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
		if (!operand.vpid ||
		    is_noncanonical_address(operand.gla, vcpu))
			return nested_vmx_failValid(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		if (cpu_has_vmx_invvpid_individual_addr()) {
			__invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
				vpid02, operand.gla);
		} else
			__vmx_flush_tlb(vcpu, vpid02, false);
		break;
	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
		if (!operand.vpid)
			return nested_vmx_failValid(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		__vmx_flush_tlb(vcpu, vpid02, false);
		break;
	case VMX_VPID_EXTENT_ALL_CONTEXT:
		__vmx_flush_tlb(vcpu, vpid02, false);
		break;
	default:
		WARN_ON_ONCE(1);
		return kvm_skip_emulated_instruction(vcpu);
	}

	return nested_vmx_succeed(vcpu);
}

static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
				     struct vmcs12 *vmcs12)
{
	u32 index = kvm_rcx_read(vcpu);
	u64 address;
	bool accessed_dirty;
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

	if (!nested_cpu_has_eptp_switching(vmcs12) ||
	    !nested_cpu_has_ept(vmcs12))
		return 1;

	if (index >= VMFUNC_EPTP_ENTRIES)
		return 1;


	if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
				     &address, index * 8, 8))
		return 1;

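	/*
	 * Bit 6 of an EPTP value (VMX_EPTP_AD_ENABLE_BIT) selects whether
	 * accessed/dirty flags are enabled for that EPT hierarchy.
	 */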
	accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);

	/*
	 * If the (L2) guest does a vmfunc to the currently
	 * active ept pointer, we don't have to do anything else
	 */
	if (vmcs12->ept_pointer != address) {
		if (!valid_ept_address(vcpu, address))
			return 1;

		kvm_mmu_unload(vcpu);
		mmu->ept_ad = accessed_dirty;
		mmu->mmu_role.base.ad_disabled = !accessed_dirty;
		vmcs12->ept_pointer = address;
		/*
		 * TODO: Check what's the correct approach in case
		 * mmu reload fails. Currently, we just let the next
		 * reload potentially fail
		 */
		kvm_mmu_reload(vcpu);
	}

	return 0;
}

static int handle_vmfunc(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	u32 function = kvm_rax_read(vcpu);

	/*
	 * VMFUNC is only supported for nested guests, but we always enable the
	 * secondary control for simplicity; for non-nested mode, fake that we
	 * didn't by injecting #UD.
	 */
	if (!is_guest_mode(vcpu)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	vmcs12 = get_vmcs12(vcpu);
	if ((vmcs12->vm_function_control & (1 << function)) == 0)
		goto fail;

	switch (function) {
	case 0:
		if (nested_vmx_eptp_switching(vcpu, vmcs12))
			goto fail;
		break;
	default:
		goto fail;
	}
	return kvm_skip_emulated_instruction(vcpu);

fail:
	nested_vmx_vmexit(vcpu, vmx->exit_reason,
			  vmcs_read32(VM_EXIT_INTR_INFO),
			  vmcs_readl(EXIT_QUALIFICATION));
	return 1;
}


static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification;
	gpa_t bitmap, last_bitmap;
	unsigned int port;
	int size;
	u8 b;

	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);

	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

	port = exit_qualification >> 16;
	size = (exit_qualification & 7) + 1;

	last_bitmap = (gpa_t)-1;
	b = -1;

	while (size > 0) {
		if (port < 0x8000)
			bitmap = vmcs12->io_bitmap_a;
		else if (port < 0x10000)
			bitmap = vmcs12->io_bitmap_b;
		else
			return true;
		bitmap += (port & 0x7fff) / 8;

		if (last_bitmap != bitmap)
			if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
				return true;
		if (b & (1 << (port & 7)))
			return true;

		port++;
		size--;
		last_bitmap = bitmap;
	}

	return false;
}

/*
 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
 * disinterest in the current event (read or write a specific MSR) by using an
 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
 */
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
					struct vmcs12 *vmcs12, u32 exit_reason)
{
	u32 msr_index = kvm_rcx_read(vcpu);
	gpa_t bitmap;

	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return true;

	/*
	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
	 * for the four combinations of read/write and low/high MSR numbers.
	 * First we need to figure out which of the four to use:
	 */
	bitmap = vmcs12->msr_bitmap;
	if (exit_reason == EXIT_REASON_MSR_WRITE)
		bitmap += 2048;
	if (msr_index >= 0xc0000000) {
		msr_index -= 0xc0000000;
		bitmap += 1024;
	}

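	/*
	 * For example, a WRMSR to MSR_EFER (0xc0000080) selects the
	 * write-high bitmap: 2048 for the write half plus 1024 for the high
	 * MSR range, with the index reduced to 0x80, so the bit tested below
	 * is bit 0 of byte 0x10 of that 1024-byte bitmap.
	 */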
	/* Then read the msr_index'th bit from this bitmap: */
	if (msr_index < 1024*8) {
		unsigned char b;
		if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
			return true;
		return 1 & (b >> (msr_index & 7));
	} else
		return true; /* let L1 handle the wrong parameter */
}

/*
 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
 * intercept (via guest_host_mask etc.) the current event.
 */
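/*
 * For example, if L1 owns CR0.TS (sets it in cr0_guest_host_mask), a MOV to
 * CR0 by L2 that flips TS relative to cr0_read_shadow must be reflected to
 * L1, while changes to bits L1 does not own are handled directly by L0.
 */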
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	int cr = exit_qualification & 15;
	int reg;
	unsigned long val;

	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
		reg = (exit_qualification >> 8) & 15;
		val = kvm_register_readl(vcpu, reg);
		switch (cr) {
		case 0:
			if (vmcs12->cr0_guest_host_mask &
			    (val ^ vmcs12->cr0_read_shadow))
				return true;
			break;
		case 3:
			if ((vmcs12->cr3_target_count >= 1 &&
			     vmcs12->cr3_target_value0 == val) ||
			    (vmcs12->cr3_target_count >= 2 &&
			     vmcs12->cr3_target_value1 == val) ||
			    (vmcs12->cr3_target_count >= 3 &&
			     vmcs12->cr3_target_value2 == val) ||
			    (vmcs12->cr3_target_count >= 4 &&
			     vmcs12->cr3_target_value3 == val))
				return false;
			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
				return true;
			break;
		case 4:
			if (vmcs12->cr4_guest_host_mask &
			    (vmcs12->cr4_read_shadow ^ val))
				return true;
			break;
		case 8:
			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
				return true;
			break;
		}
		break;
	case 2: /* clts */
		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
			return true;
		break;
	case 1: /* mov from cr */
		switch (cr) {
		case 3:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR3_STORE_EXITING)
				return true;
			break;
		case 8:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR8_STORE_EXITING)
				return true;
			break;
		}
		break;
	case 3: /* lmsw */
		/*
		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
		 * cr0. Other attempted changes are ignored, with no exit.
		 */
		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
		if (vmcs12->cr0_guest_host_mask & 0xe &
		    (val ^ vmcs12->cr0_read_shadow))
			return true;
		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
		    !(vmcs12->cr0_read_shadow & 0x1) &&
		    (val & 0x1))
			return true;
		break;
	}
	return false;
}

static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12, gpa_t bitmap)
{
	u32 vmx_instruction_info;
	unsigned long field;
	u8 b;

	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return true;

	/* Decode instruction info and find the field to access */
	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));

	/* Out-of-range fields always cause a VM exit from L2 to L1 */
	if (field >> 15)
		return true;

	if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
		return true;

	return 1 & (b >> (field & 7));
}

/*
 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
 * should handle it ourselves in L0 (and then continue L2). Only call this
 * when in is_guest_mode (L2).
 */
bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
{
	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (vmx->nested.nested_run_pending)
		return false;

	if (unlikely(vmx->fail)) {
		trace_kvm_nested_vmenter_failed(
			"hardware VM-instruction error: ",
			vmcs_read32(VM_INSTRUCTION_ERROR));
		return true;
	}

	/*
	 * The host physical addresses of some pages of guest memory
	 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
	 * Page). The CPU may write to these pages via their host
	 * physical address while L2 is running, bypassing any
	 * address-translation-based dirty tracking (e.g. EPT write
	 * protection).
	 *
	 * Mark them dirty on every exit from L2 to prevent them from
	 * getting out of sync with dirty tracking.
	 */
	nested_mark_vmcs12_pages_dirty(vcpu);

	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
				vmcs_readl(EXIT_QUALIFICATION),
				vmx->idt_vectoring_info,
				intr_info,
				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
				KVM_ISA_VMX);

	switch (exit_reason) {
	case EXIT_REASON_EXCEPTION_NMI:
		if (is_nmi(intr_info))
			return false;
		else if (is_page_fault(intr_info))
			return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
		else if (is_debug(intr_info) &&
			 vcpu->guest_debug &
			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
			return false;
		else if (is_breakpoint(intr_info) &&
			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			return false;
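		/*
		 * Otherwise, reflect the exception iff L1 asked to intercept
		 * the vector, e.g. a #GP (vector 13) is reflected only if
		 * bit 13 is set in L1's exception bitmap.
		 */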
		return vmcs12->exception_bitmap &
				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return false;
	case EXIT_REASON_TRIPLE_FAULT:
		return true;
	case EXIT_REASON_PENDING_INTERRUPT:
		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
	case EXIT_REASON_NMI_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
	case EXIT_REASON_TASK_SWITCH:
		return true;
	case EXIT_REASON_CPUID:
		return true;
	case EXIT_REASON_HLT:
		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
	case EXIT_REASON_INVD:
		return true;
	case EXIT_REASON_INVLPG:
		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_RDPMC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
	case EXIT_REASON_RDRAND:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
	case EXIT_REASON_RDSEED:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
	case EXIT_REASON_VMREAD:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmread_bitmap);
	case EXIT_REASON_VMWRITE:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmwrite_bitmap);
	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
		/*
		 * VMX instructions trap unconditionally. This allows L1 to
		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
		 */
		return true;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		return nested_vmx_exit_handled_io(vcpu, vmcs12);
	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return true;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_TRAP_FLAG:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
			nested_cpu_has2(vmcs12,
				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return false;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
	case EXIT_REASON_APIC_ACCESS:
	case EXIT_REASON_APIC_WRITE:
	case EXIT_REASON_EOI_INDUCED:
		/*
		 * The controls for "virtualize APIC accesses," "APIC-
		 * register virtualization," and "virtual-interrupt
		 * delivery" only come from vmcs12.
		 */
		return true;
	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * L0 always deals with the EPT violation. If nested EPT is
		 * used, and the nested mmu code discovers that the address is
		 * missing in the guest EPT table (EPT12), the EPT violation
		 * will be injected with nested_ept_inject_page_fault()
		 */
		return false;
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * L2 never uses directly L1's EPT, but rather L0's own EPT
		 * table (shadow on EPT) or a merged EPT table that L0 built
		 * (EPT on EPT). So any problems with the structure of the
		 * table is L0's fault.
		 */
		return false;
	case EXIT_REASON_INVPCID:
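		/*
		 * With "enable INVPCID" set in vmcs12, INVPCID is intercepted
		 * via the INVLPG-exiting control, so reflect the exit only if
		 * L1 enabled both controls.
		 */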
		return
			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return true;
	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
		/*
		 * This should never happen, since it is not possible to
		 * set XSS to a non-zero value---neither in L1 nor in L2.
		 * If it were, XSS would have to be checked against
		 * the XSS exit bitmap in vmcs12.
		 */
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
	case EXIT_REASON_PREEMPTION_TIMER:
		return false;
	case EXIT_REASON_PML_FULL:
		/* We emulate PML support to L1. */
		return false;
	case EXIT_REASON_VMFUNC:
		/* VM functions are emulated through L2->L0 vmexits. */
		return false;
	case EXIT_REASON_ENCLS:
		/* SGX is never exposed to L1 */
		return false;
	case EXIT_REASON_UMWAIT:
	case EXIT_REASON_TPAUSE:
		return nested_cpu_has2(vmcs12,
			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
	default:
		return true;
	}
}


static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_VMX,
		.size = sizeof(kvm_state),
		.hdr.vmx.vmxon_pa = -1ull,
		.hdr.vmx.vmcs12_pa = -1ull,
	};
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];

	if (!vcpu)
		return kvm_state.size + sizeof(*user_vmx_nested_state);

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (nested_vmx_allowed(vcpu) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;

		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);

			if (vmx->nested.hv_evmcs)
				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != -1ull)
				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
		}
	}

	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02. When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
	} else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
		if (vmx->nested.hv_evmcs)
			copy_enlightened_to_vmcs12(vmx);
		else if (enable_shadow_vmcs)
			copy_shadow_to_vmcs12(vmx);
	}

	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);

	/*
	 * Copy over the full allocated size of vmcs12 rather than just the size
	 * of the struct.
	 */
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != -1ull) {
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
			return -EFAULT;
	}

out:
	return kvm_state.size;
}

/*
 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
 */
void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(vcpu);
}

static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	u32 exit_qual;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable eVMCS capability on vCPU. However, since then
		 * code was changed such that flag signals vmcs12 should
		 * be copied into eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow user
		 * to set this flag even when there is no VMXON region.
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!nested_vmx_allowed(vcpu))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
		(kvm_state->flags &
		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
	    (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
		return -EINVAL;

	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
		return 0;

	if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * Sync eVMCS upon entry as we may not have
		 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
		 */
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != -1ull) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control msrs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 */
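/*
 * For example, a vmcs12 pin-based control value is consistent with these MSRs
 * iff every bit set in pinbased_ctls_low is also set in the value and every
 * bit set in the value is also set in pinbased_ctls_high.
 */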
void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
				bool apicv)
{
	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
	 * fields of vmcs01 and vmcs02, will turn these bits off - and
	 * nested_vmx_exit_reflected() will not pass related exits to L1.
	 * These rules have exceptions below.
	 */

	/* pin-based controls */
	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
	      msrs->pinbased_ctls_low,
	      msrs->pinbased_ctls_high);
	msrs->pinbased_ctls_low |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;

	/* exit controls */
	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
	      msrs->exit_ctls_low,
	      msrs->exit_ctls_high);
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;

	/* entry controls */
	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
	      msrs->entry_ctls_low,
	      msrs->entry_ctls_high);
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;

	/* cpu-based controls */
	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
	      msrs->procbased_ctls_low,
	      msrs->procbased_ctls_high);
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->procbased_ctls_high &=
		CPU_BASED_VIRTUAL_INTR_PENDING |
		CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);

	/*
	 * secondary cpu-based controls. Do not include those that
	 * depend on CPUID bits, they are added later by vmx_cpuid_update.
	 */
	if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
		      msrs->secondary_ctls_low,
		      msrs->secondary_ctls_high);

	msrs->secondary_ctls_low = 0;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_XSAVES;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
		if (cpu_has_vmx_ept_execute_only())
			msrs->ept_caps |=
				VMX_EPT_EXECUTE_ONLY_BIT;
		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}
	}

	if (cpu_has_vmx_vmfunc()) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VMFUNC;
		/*
		 * Advertise EPTP switching unconditionally
		 * since we emulate it
		 */
		if (enable_ept)
			msrs->vmfunc_controls =
				VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context. The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	/* miscellaneous data */
	rdmsr(MSR_IA32_VMX_MISC,
	      msrs->misc_low,
	      msrs->misc_high);
	msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT;
	msrs->misc_high = 0;

	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic =
		VMCS12_REVISION |
		VMX_BASIC_TRUE_CTLS |
		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;

	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	/* highest index: VMX_PREEMPTION_TIMER_VALUE */
	msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
}

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff;
	exit_handlers[EXIT_REASON_VMON] = handle_vmon;
	exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
	exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;

	kvm_x86_ops->check_nested_events = vmx_check_nested_events;
	kvm_x86_ops->get_nested_state = vmx_get_nested_state;
	kvm_x86_ops->set_nested_state = vmx_set_nested_state;
	kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
	kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
	kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;

	return 0;
}