// SPDX-License-Identifier: GPL-2.0

#include <linux/frame.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "trace.h"
#include "x86.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC(consistency_check)						\
({									\
	bool failed = (consistency_check);				\
	if (failed)							\
		trace_kvm_nested_vmenter_failed(#consistency_check, 0);	\
	failed;								\
})

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
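		 *
		 * Fields skipped below keep their intercept bits set, so L1's
		 * VMREAD/VMWRITE of them will VM-exit and be emulated using
		 * the cached vmcs12.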
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
		return nested_vmx_failInvalid(vcpu);

	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force a shadow sync because
	 * VM_INSTRUCTION_ERROR is not shadowed
	 */
	return kvm_skip_emulated_instruction(vcpu);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't just reset the guest here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, -1ull);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.hv_evmcs)
		return;

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
	vmx->nested.hv_evmcs_vmptr = -1ull;
	vmx->nested.hv_evmcs = NULL;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
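 *
 * This tears down vmcs02, the cached vmcs12 and shadow vmcs12 copies, the
 * eVMCS mapping, and any guest pages pinned or mapped for vmcs02.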
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_dirty(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (vmx->loaded_vmcs == vmcs)
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vmx_segment_cache_clear(vmx);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
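 *
 * (free_nested() tears down vmcs02, so vmcs02 should not be the logical
 * processor's current VMCS while that happens.)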
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
	free_nested(vcpu);
	vcpu_put(vcpu);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else if (fault->error_code & PFERR_RSVD_MASK)
		exit_reason = EXIT_REASON_EPT_MISCONFIG;
	else
		exit_reason = EXIT_REASON_EPT_VIOLATION;

	nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	kvm_init_shadow_ept_mmu(vcpu,
				to_vmx(vcpu)->nested.msrs.ept_caps &
				VMX_EPT_EXECUTE_ONLY_BIT,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_cr3(vcpu));
	vcpu->arch.mmu->set_cr3 = vmx_set_cr3;
	vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}


/*
 * KVM wants to inject the page faults it receives into the guest. In a
 * nested guest, this function checks whether the fault needs to be
 * injected into L1 or L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (nr == PF_VECTOR) {
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			*exit_qual = has_payload ?
				     payload : vcpu->arch.cr2;
			return 1;
		}
	} else if (vmcs12->exception_bitmap & (1u << nr)) {
		if (nr == DB_VECTOR) {
			if (!has_payload) {
				payload = vcpu->arch.dr6;
				payload &= ~(DR6_FIXED_1 | DR6_BT);
				payload ^= DR6_RTM;
			}
			*exit_qual = payload;
		} else
			*exit_qual = 0;
		return 1;
	}

	return 0;
}


static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
	    !to_vmx(vcpu)->nested.nested_run_pending) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}

static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * Check if an MSR write is intercepted by the vmcs01 (L0/L1) MSR bitmap.
 */
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;

	if (msr <= 0x1fff) {
		return !!test_bit(msr, msr_bitmap + 0x800 / f);
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
	}

	return true;
}

/*
 * If an MSR is allowed by L0, check whether it is also allowed by L1;
 * the corresponding intercept bit is cleared only if both L0 and L1
 * allow it.
 */
static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
						 unsigned long *msr_bitmap_nested,
						 u32 msr, int type)
{
	int f = sizeof(unsigned long);

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
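	 *
	 * Layout used below: read-low at offset 0x000, read-high at 0x400,
	 * write-low at 0x800 and write-high at 0xc00; one bit per MSR.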
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
			/* read-low */
			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
			/* write-low */
			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
			/* read-high */
			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
			/* write-high */
			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

	}
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively disable them below.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);

	/*
	 * Checking the L0->L1 bitmap is trying to verify two things:
	 *
	 * 1. L0 gave a permission to L1 to actually passthrough the MSR.
	 *    This ensures that we do not accidentally generate an L02 MSR
	 *    bitmap from the L12 MSR bitmap that is too permissive.
	 * 2. That L1 or L2s have actually used the MSR. This avoids
	 *    unnecessary merging of the bitmap if the MSR is unused. This
	 *    works properly because we only update the L01 MSR bitmap lazily.
	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
	 *    updated to reflect this when L1 (or its L2s) actually write to
	 *    the MSR.
	 */
	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_SPEC_CTRL,
					MSR_TYPE_R | MSR_TYPE_W);

	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_PRED_CMD,
					MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct kvm_host_map map;
	struct vmcs12 *shadow;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	shadow = get_shadow_vmcs12(vcpu);

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
		return;

	memcpy(shadow, map.hva, VMCS12_SIZE);
	kvm_vcpu_unmap(vcpu, &map, false);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
	return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
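	 *
	 * Posted interrupts additionally require virtual-interrupt delivery
	 * and the "acknowledge interrupt on exit" VM-exit control; both are
	 * checked below.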
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	   (CC(!nested_cpu_has_vid(vmcs12)) ||
	    CC(!nested_exit_intr_ack_set(vcpu)) ||
	    CC((vmcs12->posted_intr_nv & 0xff00)) ||
	    CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
	    CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	int maxphyaddr;

	if (count == 0)
		return 0;
	maxphyaddr = cpuid_maxphyaddr(vcpu);
	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu
				      *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

/*
 * Load guest's/host's MSRs at nested entry/exit.
 * Return 0 for success, entry index for failure.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;

	for (i = 0; i < count; i++) {
		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	return i + 1;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;

	for (i = 0; i < count; i++) {
		if (kvm_vcpu_read_guest(vcpu,
					gpa + i * sizeof(e),
					&e, 2 * sizeof(u32))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			return -EINVAL;
		}
		if (nested_vmx_store_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			return -EINVAL;
		}
		if (kvm_get_msr(vcpu, e.index, &data)) {
			pr_debug_ratelimited(
				"%s cannot read MSR (%u, 0x%x)\n",
				__func__, i, e.index);
			return -EINVAL;
		}
		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
					     offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
	unsigned long invalid_mask;

	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
	return (val & invalid_mask) == 0;
}

/*
 * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
 * emulating VM entry into a guest with EPT enabled. Returns 0 on success,
 * -EINVAL on failure; the "invalid state" exit qualification code is
 * assigned to *entry_failure_code on failure.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
			       u32 *entry_failure_code)
{
	if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
		if (CC(!nested_cr3_valid(vcpu, cr3))) {
			*entry_failure_code = ENTRY_FAIL_DEFAULT;
			return -EINVAL;
		}

		/*
		 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
		 * must not be dereferenced.
		 */
		if (is_pae_paging(vcpu) && !nested_ept) {
			if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
				*entry_failure_code = ENTRY_FAIL_PDPTE;
				return -EINVAL;
			}
		}
	}

	if (!nested_ept)
		kvm_mmu_new_cr3(vcpu, cr3, false);

	vcpu->arch.cr3 = cr3;
	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);

	kvm_init_mmu(vcpu, false);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L1 uses EPT, then TLB entries are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return nested_cpu_has_ept(vmcs12) ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
}


static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmx->nested.msrs.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
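	 * That capability is advertised via bit 48 of IA32_VMX_BASIC, so
	 * bit 48 must be clear in the restored value.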
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 supported;
	u32 *lowp, *highp;

	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		lowp = &vmx->nested.msrs.pinbased_ctls_low;
		highp = &vmx->nested.msrs.pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		lowp = &vmx->nested.msrs.procbased_ctls_low;
		highp = &vmx->nested.msrs.procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		lowp = &vmx->nested.msrs.exit_ctls_low;
		highp = &vmx->nested.msrs.exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		lowp = &vmx->nested.msrs.entry_ctls_low;
		highp = &vmx->nested.msrs.entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		lowp = &vmx->nested.msrs.secondary_ctls_low;
		highp = &vmx->nested.msrs.secondary_ctls_high;
		break;
	default:
		BUG();
	}

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc;

	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				   vmx->nested.msrs.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap;

	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
					   vmx->nested.msrs.vpid_caps);

	/*
	 * Every bit is either reserved or a feature bit.
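	 * Either way, the restored value must be a subset of what KVM
	 * reports as supported.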
	 */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 *msr;

	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		msr = &vmx->nested.msrs.cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		msr = &vmx->nested.msrs.cr4_fixed0;
		break;
	default:
		BUG();
	}

	/*
	 * 1 bits (which indicate bits that are "must-be-1" during VMX
	 * operation) must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*msr = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmx->nested.msrs.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/*
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
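 *
 * Note: the copy is done with preemption disabled while vmcs01's shadow
 * VMCS is temporarily loaded as the current VMCS.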
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip =
			evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr =
			evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
	}

	/*
	 * Not used?
	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
	 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
	 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
	 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
	 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
	 * vmcs12->page_fault_error_code_mask =
	 *		evmcs->page_fault_error_code_mask;
	 * vmcs12->page_fault_error_code_match =
	 *		evmcs->page_fault_error_code_match;
	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
	 */

	/*
	 * Read only fields:
	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
	 * vmcs12->exit_qualification = evmcs->exit_qualification;
	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
	 *
	 * Not present in struct vmcs12:
	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return 0;
}

static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/*
	 * Should not be changed by KVM:
	 *
	 * evmcs->host_es_selector = vmcs12->host_es_selector;
	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
	 * evmcs->host_cr0 = vmcs12->host_cr0;
	 * evmcs->host_cr3 = vmcs12->host_cr3;
	 * evmcs->host_cr4 = vmcs12->host_cr4;
	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
	 * evmcs->host_rip = vmcs12->host_rip;
	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
	 * evmcs->host_fs_base = vmcs12->host_fs_base;
	 * evmcs->host_gs_base = vmcs12->host_gs_base;
	 * evmcs->host_tr_base = vmcs12->host_tr_base;
	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
	 * evmcs->host_rsp = vmcs12->host_rsp;
	 *
	 * sync_vmcs02_to_vmcs12() doesn't read these:
	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
	 * evmcs->ept_pointer = vmcs12->ept_pointer;
	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
	 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
	 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
	 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
	 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
	 * evmcs->page_fault_error_code_mask =
	 *		vmcs12->page_fault_error_code_mask;
	 * evmcs->page_fault_error_code_match =
	 *		vmcs12->page_fault_error_code_match;
	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
	 * evmcs->tsc_offset = vmcs12->tsc_offset;
	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
	 *
	 * Not present in struct vmcs12:
	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
	 */

	evmcs->guest_es_selector = vmcs12->guest_es_selector;
	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;

	evmcs->guest_es_limit = vmcs12->guest_es_limit;
	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;

	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
	evmcs->guest_cs_ar_bytes =
		vmcs12->guest_cs_ar_bytes;
	evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
	evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
	evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
	evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
	evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
	evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;

	evmcs->guest_es_base = vmcs12->guest_es_base;
	evmcs->guest_cs_base = vmcs12->guest_cs_base;
	evmcs->guest_ss_base = vmcs12->guest_ss_base;
	evmcs->guest_ds_base = vmcs12->guest_ds_base;
	evmcs->guest_fs_base = vmcs12->guest_fs_base;
	evmcs->guest_gs_base = vmcs12->guest_gs_base;
	evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
	evmcs->guest_tr_base = vmcs12->guest_tr_base;
	evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
	evmcs->guest_idtr_base = vmcs12->guest_idtr_base;

	evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
	evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;

	evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
	evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
	evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
	evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;

	evmcs->guest_pending_dbg_exceptions =
		vmcs12->guest_pending_dbg_exceptions;
	evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
	evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;

	evmcs->guest_activity_state = vmcs12->guest_activity_state;
	evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;

	evmcs->guest_cr0 = vmcs12->guest_cr0;
	evmcs->guest_cr3 = vmcs12->guest_cr3;
	evmcs->guest_cr4 = vmcs12->guest_cr4;
	evmcs->guest_dr7 = vmcs12->guest_dr7;

	evmcs->guest_physical_address = vmcs12->guest_physical_address;

	evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
	evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
	evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
	evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
	evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
	evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
	evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
	evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;

	evmcs->exit_qualification = vmcs12->exit_qualification;

	evmcs->guest_linear_address = vmcs12->guest_linear_address;
	evmcs->guest_rsp = vmcs12->guest_rsp;
	evmcs->guest_rflags = vmcs12->guest_rflags;

	evmcs->guest_interruptibility_info =
		vmcs12->guest_interruptibility_info;
	evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
	evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
	evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
	evmcs->vm_entry_exception_error_code =
		vmcs12->vm_entry_exception_error_code;
	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;

	evmcs->guest_rip = vmcs12->guest_rip;

	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;

	return 0;
}

/*
 * This is an equivalent of the nested hypervisor executing the vmptrld
 * instruction.
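 *
 * The eVMCS GPA is taken from the Hyper-V VP assist page, mapped into the
 * host, and used from then on as the source of the cached vmcs12.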
 */
static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
						 bool from_launch)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool evmcs_gpa_changed = false;
	u64 evmcs_gpa;

	if (likely(!vmx->nested.enlightened_vmcs_enabled))
		return 1;

	if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
		return 1;

	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
		if (!vmx->nested.hv_evmcs)
			vmx->nested.current_vmptr = -1ull;

		nested_release_evmcs(vcpu);

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
				 &vmx->nested.hv_evmcs_map))
			return 0;

		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

		/*
		 * Currently, KVM only supports eVMCS version 1
		 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set
		 * this value in the first u32 field of the eVMCS, which
		 * should specify the eVMCS VersionNumber.
		 *
		 * The guest should learn the eVMCS versions supported by the
		 * host by examining CPUID.0x4000000A.EAX[0:15]. The host
		 * userspace VMM is expected to set this CPUID leaf according
		 * to the value returned in vmcs_version from
		 * nested_enable_evmcs().
		 *
		 * However, it turns out that Microsoft Hyper-V fails to
		 * comply with its own invented interface: when Hyper-V uses
		 * eVMCS, it just sets the first u32 field of the eVMCS to the
		 * revision_id specified in MSR_IA32_VMX_BASIC, instead of the
		 * eVMCS version number, which should be one of the supported
		 * versions specified in CPUID.0x4000000A.EAX[0:15].
		 *
		 * To work around this Hyper-V bug, accept either a supported
		 * eVMCS version or the VMCS12 revision_id as valid values for
		 * the first u32 field of the eVMCS.
		 */
		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
			nested_release_evmcs(vcpu);
			return 0;
		}

		vmx->nested.dirty_vmcs12 = true;
		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

		evmcs_gpa_changed = true;
		/*
		 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
		 * reloaded from guest memory (read-only fields, fields not
		 * present in struct hv_enlightened_vmcs, ...). Make sure
		 * there are no leftovers.
		 */
		if (from_launch) {
			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
			memset(vmcs12, 0, sizeof(*vmcs12));
			vmcs12->hdr.revision_id = VMCS12_REVISION;
		}

	}

	/*
	 * Clean-fields data can't be used on VMLAUNCH and when we switch
	 * between different L2 guests as KVM keeps a single VMCS12 per L1.
	 */
	if (from_launch || evmcs_gpa_changed)
		vmx->nested.hv_evmcs->hv_clean_fields &=
			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return 1;
}

void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * hv_evmcs may end up being not mapped after migration (when
	 * L2 was running), map it here to make sure vmcs12 changes are
	 * properly reflected.
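	 * (If that mapping attempt fails, hv_evmcs stays NULL and the plain
	 *  shadow-VMCS path below is taken instead.)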
1887 */ 1888 if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) 1889 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 1890 1891 if (vmx->nested.hv_evmcs) { 1892 copy_vmcs12_to_enlightened(vmx); 1893 /* All fields are clean */ 1894 vmx->nested.hv_evmcs->hv_clean_fields |= 1895 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 1896 } else { 1897 copy_vmcs12_to_shadow(vmx); 1898 } 1899 1900 vmx->nested.need_vmcs12_to_shadow_sync = false; 1901 } 1902 1903 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 1904 { 1905 struct vcpu_vmx *vmx = 1906 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 1907 1908 vmx->nested.preemption_timer_expired = true; 1909 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 1910 kvm_vcpu_kick(&vmx->vcpu); 1911 1912 return HRTIMER_NORESTART; 1913 } 1914 1915 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) 1916 { 1917 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; 1918 struct vcpu_vmx *vmx = to_vmx(vcpu); 1919 1920 /* 1921 * A timer value of zero is architecturally guaranteed to cause 1922 * a VMExit prior to executing any instructions in the guest. 1923 */ 1924 if (preemption_timeout == 0) { 1925 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 1926 return; 1927 } 1928 1929 if (vcpu->arch.virtual_tsc_khz == 0) 1930 return; 1931 1932 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 1933 preemption_timeout *= 1000000; 1934 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 1935 hrtimer_start(&vmx->nested.preemption_timer, 1936 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); 1937 } 1938 1939 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 1940 { 1941 if (vmx->nested.nested_run_pending && 1942 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 1943 return vmcs12->guest_ia32_efer; 1944 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 1945 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 1946 else 1947 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 1948 } 1949 1950 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 1951 { 1952 /* 1953 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 1954 * according to L0's settings (vmcs12 is irrelevant here). Host 1955 * fields that come from L0 and are not constant, e.g. HOST_CR3, 1956 * will be set as needed prior to VMLAUNCH/VMRESUME. 1957 */ 1958 if (vmx->nested.vmcs02_initialized) 1959 return; 1960 vmx->nested.vmcs02_initialized = true; 1961 1962 /* 1963 * We don't care what the EPTP value is we just need to guarantee 1964 * it's valid so we don't get a false positive when doing early 1965 * consistency checks. 1966 */ 1967 if (enable_ept && nested_early_check) 1968 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); 1969 1970 /* All VMFUNCs are currently emulated through L0 vmexits. */ 1971 if (cpu_has_vmx_vmfunc()) 1972 vmcs_write64(VM_FUNCTION_CONTROL, 0); 1973 1974 if (cpu_has_vmx_posted_intr()) 1975 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 1976 1977 if (cpu_has_vmx_msr_bitmap()) 1978 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 1979 1980 /* 1981 * The PML address never changes, so it is constant in vmcs02. 1982 * Conceptually we want to copy the PML index from vmcs01 here, 1983 * and then back to vmcs01 on nested vmexit. But since we flush 1984 * the log and reset GUEST_PML_INDEX on each vmexit, the PML 1985 * index is also effectively constant in vmcs02. 
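	 * (Note: vmcs02 always points PML at L0's own log page, vmx->pml_pg;
	 *  prepare_vmcs02_early() strips SECONDARY_EXEC_ENABLE_PML from
	 *  vmcs12's controls, so a PML request from vmcs12 is not passed
	 *  through directly.)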
1986 */ 1987 if (enable_pml) { 1988 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 1989 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 1990 } 1991 1992 if (cpu_has_vmx_encls_vmexit()) 1993 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); 1994 1995 /* 1996 * Set the MSR load/store lists to match L0's settings. Only the 1997 * addresses are constant (for vmcs02), the counts can change based 1998 * on L2's behavior, e.g. switching to/from long mode. 1999 */ 2000 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 2001 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2002 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2003 2004 vmx_set_constant_host_state(vmx); 2005 } 2006 2007 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2008 struct vmcs12 *vmcs12) 2009 { 2010 prepare_vmcs02_constant_state(vmx); 2011 2012 vmcs_write64(VMCS_LINK_POINTER, -1ull); 2013 2014 if (enable_vpid) { 2015 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2016 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2017 else 2018 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2019 } 2020 } 2021 2022 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2023 { 2024 u32 exec_control, vmcs12_exec_ctrl; 2025 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2026 2027 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) 2028 prepare_vmcs02_early_rare(vmx, vmcs12); 2029 2030 /* 2031 * PIN CONTROLS 2032 */ 2033 exec_control = vmx_pin_based_exec_ctrl(vmx); 2034 exec_control |= (vmcs12->pin_based_vm_exec_control & 2035 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2036 2037 /* Posted interrupts setting is only taken from vmcs12. */ 2038 if (nested_cpu_has_posted_intr(vmcs12)) { 2039 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2040 vmx->nested.pi_pending = false; 2041 } else { 2042 exec_control &= ~PIN_BASED_POSTED_INTR; 2043 } 2044 pin_controls_set(vmx, exec_control); 2045 2046 /* 2047 * EXEC CONTROLS 2048 */ 2049 exec_control = vmx_exec_control(vmx); /* L0's desires */ 2050 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 2051 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 2052 exec_control &= ~CPU_BASED_TPR_SHADOW; 2053 exec_control |= vmcs12->cpu_based_vm_exec_control; 2054 2055 if (exec_control & CPU_BASED_TPR_SHADOW) 2056 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2057 #ifdef CONFIG_X86_64 2058 else 2059 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2060 CPU_BASED_CR8_STORE_EXITING; 2061 #endif 2062 2063 /* 2064 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2065 * for I/O port accesses. 2066 */ 2067 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2068 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2069 2070 /* 2071 * This bit will be computed in nested_get_vmcs12_pages, because 2072 * we do not have access to L1's MSR bitmap yet. For now, keep 2073 * the same bit as before, hoping to avoid multiple VMWRITEs that 2074 * only set/clear this bit. 
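	 * (nested_get_vmcs12_pages() later calls
	 *  nested_vmx_prepare_msr_bitmap() and sets or clears
	 *  CPU_BASED_USE_MSR_BITMAPS based on the result.)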
2075 */ 2076 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2077 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2078 2079 exec_controls_set(vmx, exec_control); 2080 2081 /* 2082 * SECONDARY EXEC CONTROLS 2083 */ 2084 if (cpu_has_secondary_exec_ctrls()) { 2085 exec_control = vmx->secondary_exec_control; 2086 2087 /* Take the following fields only from vmcs12 */ 2088 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2089 SECONDARY_EXEC_ENABLE_INVPCID | 2090 SECONDARY_EXEC_RDTSCP | 2091 SECONDARY_EXEC_XSAVES | 2092 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2093 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2094 SECONDARY_EXEC_ENABLE_VMFUNC); 2095 if (nested_cpu_has(vmcs12, 2096 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { 2097 vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & 2098 ~SECONDARY_EXEC_ENABLE_PML; 2099 exec_control |= vmcs12_exec_ctrl; 2100 } 2101 2102 /* VMCS shadowing for L2 is emulated for now */ 2103 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2104 2105 /* 2106 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2107 * will not have to rewrite the controls just for this bit. 2108 */ 2109 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2110 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2111 exec_control |= SECONDARY_EXEC_DESC; 2112 2113 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2114 vmcs_write16(GUEST_INTR_STATUS, 2115 vmcs12->guest_intr_status); 2116 2117 secondary_exec_controls_set(vmx, exec_control); 2118 } 2119 2120 /* 2121 * ENTRY CONTROLS 2122 * 2123 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2124 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2125 * on the related bits (if supported by the CPU) in the hope that 2126 * we can avoid VMWrites during vmx_set_efer(). 2127 */ 2128 exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & 2129 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; 2130 if (cpu_has_load_ia32_efer()) { 2131 if (guest_efer & EFER_LMA) 2132 exec_control |= VM_ENTRY_IA32E_MODE; 2133 if (guest_efer != host_efer) 2134 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2135 } 2136 vm_entry_controls_set(vmx, exec_control); 2137 2138 /* 2139 * EXIT CONTROLS 2140 * 2141 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2142 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2143 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
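	 * (As with the entry controls above, VM_EXIT_LOAD_IA32_EFER is set
	 *  speculatively below when guest_efer differs from host_efer, in the
	 *  hope of avoiding a VMWRITE in vmx_set_efer().)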
2144 */ 2145 exec_control = vmx_vmexit_ctrl(); 2146 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2147 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2148 vm_exit_controls_set(vmx, exec_control); 2149 2150 /* 2151 * Interrupt/Exception Fields 2152 */ 2153 if (vmx->nested.nested_run_pending) { 2154 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2155 vmcs12->vm_entry_intr_info_field); 2156 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2157 vmcs12->vm_entry_exception_error_code); 2158 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2159 vmcs12->vm_entry_instruction_len); 2160 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2161 vmcs12->guest_interruptibility_info); 2162 vmx->loaded_vmcs->nmi_known_unmasked = 2163 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2164 } else { 2165 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2166 } 2167 } 2168 2169 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2170 { 2171 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2172 2173 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2174 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2175 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2176 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2177 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2178 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2179 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2180 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2181 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2182 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2183 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2184 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2185 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2186 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2187 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2188 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2189 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2190 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2191 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2192 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2193 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2194 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2195 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2196 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2197 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2198 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2199 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2200 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2201 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2202 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2203 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2204 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2205 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2206 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2207 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2208 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2209 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2210 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2211 } 2212 2213 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2214 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2215 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2216 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 
2217 vmcs12->guest_pending_dbg_exceptions); 2218 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2219 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2220 2221 /* 2222 * L1 may access the L2's PDPTR, so save them to construct 2223 * vmcs12 2224 */ 2225 if (enable_ept) { 2226 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2227 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2228 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2229 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2230 } 2231 2232 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2233 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2234 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2235 } 2236 2237 if (nested_cpu_has_xsaves(vmcs12)) 2238 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2239 2240 /* 2241 * Whether page-faults are trapped is determined by a combination of 2242 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. 2243 * If enable_ept, L0 doesn't care about page faults and we should 2244 * set all of these to L1's desires. However, if !enable_ept, L0 does 2245 * care about (at least some) page faults, and because it is not easy 2246 * (if at all possible?) to merge L0 and L1's desires, we simply ask 2247 * to exit on each and every L2 page fault. This is done by setting 2248 * MASK=MATCH=0 and (see below) EB.PF=1. 2249 * Note that below we don't need special code to set EB.PF beyond the 2250 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2251 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2252 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2253 */ 2254 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 2255 enable_ept ? vmcs12->page_fault_error_code_mask : 0); 2256 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 2257 enable_ept ? vmcs12->page_fault_error_code_match : 0); 2258 2259 if (cpu_has_vmx_apicv()) { 2260 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2261 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2262 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2263 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2264 } 2265 2266 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2267 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2268 2269 set_cr4_guest_host_mask(vmx); 2270 } 2271 2272 /* 2273 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2274 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2275 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2276 * guest in a way that will both be appropriate to L1's requests, and our 2277 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2278 * function also has additional necessary side-effects, like setting various 2279 * vcpu->arch fields. 2280 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2281 * is assigned to entry_failure_code on failure. 
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  u32 *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
	bool load_guest_pdptrs_vmcs12 = false;

	if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
		prepare_vmcs02_rare(vmx, vmcs12);
		vmx->nested.dirty_vmcs12 = false;

		load_guest_pdptrs_vmcs12 = !hv_evmcs ||
			!(hv_evmcs->hv_clean_fields &
			  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
	}

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
	}
	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);

	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
	}

	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);

	if (kvm_has_tsc_control)
		decache_tsc_multiplier(vmx);

	if (enable_vpid) {
		/*
		 * There is no direct mapping between vpid02 and vpid12:
		 * vpid02 is per-vCPU for L0 and reused, while the value of
		 * vpid12 is changed with one INVVPID during nested vmentry.
		 * The vpid12 is allocated by L1 for L2, so it will not
		 * influence the global bitmap (for vpid01 and vpid02
		 * allocation) even if L1 spawns a lot of nested vCPUs.
		 */
		if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
			if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
				vmx->nested.last_vpid = vmcs12->virtual_processor_id;
				__vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
			}
		} else {
			/*
			 * If L1 uses EPT, then L0 needs to execute INVEPT on
			 * EPTP02 instead of EPTP01. Therefore, delay TLB
			 * flush until vmcs02->eptp is fully updated by
			 * KVM_REQ_LOAD_CR3. Note that this assumes
			 * KVM_REQ_TLB_FLUSH is evaluated after
			 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
			 */
			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
		}
	}

	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);
	else if (nested_cpu_has2(vmcs12,
				 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		vmx_flush_tlb(vcpu, true);

	/*
	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
	 * bits which we consider mandatory enabled.
	 * The CR0_READ_SHADOW is what L2 should have expected to read given
	 * the specifications by L1; it's not enough to take
	 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may have
	 * more bits set than L1 expected.
	 */
	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
	vmx_set_efer(vcpu, vcpu->arch.efer);

	/*
	 * Guest state is invalid and unrestricted guest is disabled,
	 * which means L1 attempted VMEntry to L2 with invalid state.
	 * Fail the VMEntry.
	 */
	if (vmx->emulation_required) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/* Load guest CR3: shadowed by either EPT or shadow page tables. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				entry_failure_code))
		return -EINVAL;

	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
	    is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
	}

	if (!enable_ept)
		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;

	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
	kvm_rip_write(vcpu, vmcs12->guest_rip);
	return 0;
}

static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
{
	if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
	       nested_cpu_has_virtual_nmis(vmcs12)))
		return -EINVAL;

	if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
	       nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)))
		return -EINVAL;

	return 0;
}

static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int maxphyaddr = cpuid_maxphyaddr(vcpu);

	/* Check for memory type validity */
	switch (address & VMX_EPTP_MT_MASK) {
	case VMX_EPTP_MT_UC:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
			return false;
		break;
	case VMX_EPTP_MT_WB:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
			return false;
		break;
	default:
		return false;
	}

	/* Only a 4-level page-walk length is valid. */
	if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4))
		return false;

	/* Reserved bits should not be set */
	if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f)))
		return false;

	/* AD, if set, should be supported */
	if (address & VMX_EPTP_AD_ENABLE_BIT) {
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
			return false;
	}

	return true;
}

/*
 * Checks related to VM-Execution Control Fields
 */
static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
				   vmx->nested.msrs.pinbased_ctls_low,
				   vmx->nested.msrs.pinbased_ctls_high)) ||
CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2477 vmx->nested.msrs.procbased_ctls_low, 2478 vmx->nested.msrs.procbased_ctls_high))) 2479 return -EINVAL; 2480 2481 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2482 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2483 vmx->nested.msrs.secondary_ctls_low, 2484 vmx->nested.msrs.secondary_ctls_high))) 2485 return -EINVAL; 2486 2487 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2488 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2489 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2490 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2491 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2492 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2493 nested_vmx_check_nmi_controls(vmcs12) || 2494 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2495 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2496 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2497 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2498 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2499 return -EINVAL; 2500 2501 if (!nested_cpu_has_preemption_timer(vmcs12) && 2502 nested_cpu_has_save_preemption_timer(vmcs12)) 2503 return -EINVAL; 2504 2505 if (nested_cpu_has_ept(vmcs12) && 2506 CC(!valid_ept_address(vcpu, vmcs12->ept_pointer))) 2507 return -EINVAL; 2508 2509 if (nested_cpu_has_vmfunc(vmcs12)) { 2510 if (CC(vmcs12->vm_function_control & 2511 ~vmx->nested.msrs.vmfunc_controls)) 2512 return -EINVAL; 2513 2514 if (nested_cpu_has_eptp_switching(vmcs12)) { 2515 if (CC(!nested_cpu_has_ept(vmcs12)) || 2516 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2517 return -EINVAL; 2518 } 2519 } 2520 2521 return 0; 2522 } 2523 2524 /* 2525 * Checks related to VM-Exit Control Fields 2526 */ 2527 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2528 struct vmcs12 *vmcs12) 2529 { 2530 struct vcpu_vmx *vmx = to_vmx(vcpu); 2531 2532 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2533 vmx->nested.msrs.exit_ctls_low, 2534 vmx->nested.msrs.exit_ctls_high)) || 2535 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2536 return -EINVAL; 2537 2538 return 0; 2539 } 2540 2541 /* 2542 * Checks related to VM-Entry Control Fields 2543 */ 2544 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2545 struct vmcs12 *vmcs12) 2546 { 2547 struct vcpu_vmx *vmx = to_vmx(vcpu); 2548 2549 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2550 vmx->nested.msrs.entry_ctls_low, 2551 vmx->nested.msrs.entry_ctls_high))) 2552 return -EINVAL; 2553 2554 /* 2555 * From the Intel SDM, volume 3: 2556 * Fields relevant to VM-entry event injection must be set properly. 2557 * These fields are the VM-entry interruption-information field, the 2558 * VM-entry exception error code, and the VM-entry instruction length. 
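 *
 * For example (matching the checks below): injecting a page fault (hardware
 * exception, vector 14) into a protected-mode guest requires the
 * deliver-error-code bit to be set, and bits 31:15 of the error code must
 * be zero.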
2559 */ 2560 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2561 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2562 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2563 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2564 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2565 bool should_have_error_code; 2566 bool urg = nested_cpu_has2(vmcs12, 2567 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2568 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2569 2570 /* VM-entry interruption-info field: interruption type */ 2571 if (CC(intr_type == INTR_TYPE_RESERVED) || 2572 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2573 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2574 return -EINVAL; 2575 2576 /* VM-entry interruption-info field: vector */ 2577 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2578 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2579 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2580 return -EINVAL; 2581 2582 /* VM-entry interruption-info field: deliver error code */ 2583 should_have_error_code = 2584 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2585 x86_exception_has_error_code(vector); 2586 if (CC(has_error_code != should_have_error_code)) 2587 return -EINVAL; 2588 2589 /* VM-entry exception error code */ 2590 if (CC(has_error_code && 2591 vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))) 2592 return -EINVAL; 2593 2594 /* VM-entry interruption-info field: reserved bits */ 2595 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2596 return -EINVAL; 2597 2598 /* VM-entry instruction length */ 2599 switch (intr_type) { 2600 case INTR_TYPE_SOFT_EXCEPTION: 2601 case INTR_TYPE_SOFT_INTR: 2602 case INTR_TYPE_PRIV_SW_EXCEPTION: 2603 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2604 CC(vmcs12->vm_entry_instruction_len == 0 && 2605 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2606 return -EINVAL; 2607 } 2608 } 2609 2610 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2611 return -EINVAL; 2612 2613 return 0; 2614 } 2615 2616 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2617 struct vmcs12 *vmcs12) 2618 { 2619 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2620 nested_check_vm_exit_controls(vcpu, vmcs12) || 2621 nested_check_vm_entry_controls(vcpu, vmcs12)) 2622 return -EINVAL; 2623 2624 return 0; 2625 } 2626 2627 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2628 struct vmcs12 *vmcs12) 2629 { 2630 bool ia32e; 2631 2632 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2633 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2634 CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3))) 2635 return -EINVAL; 2636 2637 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2638 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2639 return -EINVAL; 2640 2641 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2642 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2643 return -EINVAL; 2644 2645 ia32e = (vmcs12->vm_exit_controls & 2646 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; 2647 2648 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2649 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2650 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2651 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2652 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2653 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | 
SEGMENT_TI_MASK)) || 2654 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2655 CC(vmcs12->host_cs_selector == 0) || 2656 CC(vmcs12->host_tr_selector == 0) || 2657 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2658 return -EINVAL; 2659 2660 #ifdef CONFIG_X86_64 2661 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2662 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2663 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2664 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2665 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu))) 2666 return -EINVAL; 2667 #endif 2668 2669 /* 2670 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2671 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2672 * the values of the LMA and LME bits in the field must each be that of 2673 * the host address-space size VM-exit control. 2674 */ 2675 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2676 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2677 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2678 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2679 return -EINVAL; 2680 } 2681 2682 return 0; 2683 } 2684 2685 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2686 struct vmcs12 *vmcs12) 2687 { 2688 int r = 0; 2689 struct vmcs12 *shadow; 2690 struct kvm_host_map map; 2691 2692 if (vmcs12->vmcs_link_pointer == -1ull) 2693 return 0; 2694 2695 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2696 return -EINVAL; 2697 2698 if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) 2699 return -EINVAL; 2700 2701 shadow = map.hva; 2702 2703 if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || 2704 CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2705 r = -EINVAL; 2706 2707 kvm_vcpu_unmap(vcpu, &map, false); 2708 return r; 2709 } 2710 2711 /* 2712 * Checks related to Guest Non-register State 2713 */ 2714 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2715 { 2716 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2717 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)) 2718 return -EINVAL; 2719 2720 return 0; 2721 } 2722 2723 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 2724 struct vmcs12 *vmcs12, 2725 u32 *exit_qual) 2726 { 2727 bool ia32e; 2728 2729 *exit_qual = ENTRY_FAIL_DEFAULT; 2730 2731 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 2732 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 2733 return -EINVAL; 2734 2735 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 2736 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 2737 return -EINVAL; 2738 2739 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 2740 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; 2741 return -EINVAL; 2742 } 2743 2744 /* 2745 * If the load IA32_EFER VM-entry control is 1, the following checks 2746 * are performed on the field for the IA32_EFER MSR: 2747 * - Bits reserved in the IA32_EFER MSR must be 0. 2748 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 2749 * the IA-32e mode guest VM-exit control. It must also be identical 2750 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 2751 * CR0.PG) is 1. 
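 * For instance, when VM_ENTRY_LOAD_IA32_EFER is used for a 64-bit L2
 * (IA-32e mode entry control set and guest CR0.PG = 1), both LMA and LME
 * must be set in the guest IA32_EFER field for the checks below to pass.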
	 */
	if (to_vmx(vcpu)->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
		if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
		    CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
		    CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
			return -EINVAL;
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
	    (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
	     CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
		return -EINVAL;

	if (nested_check_guest_non_reg_state(vmcs12))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long cr3, cr4;
	bool vm_fail;

	if (!nested_early_check)
		return 0;

	if (vmx->msr_autoload.host.nr)
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	if (vmx->msr_autoload.guest.nr)
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);

	preempt_disable();

	vmx_prepare_switch_to_guest(vcpu);

	/*
	 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
	 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
	 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
	 * there is no need to preserve other bits or save/restore the field.
	 */
	vmcs_writel(GUEST_RFLAGS, 0);

	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->host_state.cr3 = cr3;
	}

	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->host_state.cr4 = cr4;
	}

	asm(
		"sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
		"cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
		"je 1f \n\t"
		__ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
		"mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
		"1: \n\t"
		"add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */

		/* Check if vmlaunch or vmresume is needed */
		"cmpb $0, %c[launched](%[loaded_vmcs])\n\t"

		/*
		 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
		 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
		 * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
		 * result of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
2829 */ 2830 "call vmx_vmenter\n\t" 2831 2832 CC_SET(be) 2833 : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) 2834 : [HOST_RSP]"r"((unsigned long)HOST_RSP), 2835 [loaded_vmcs]"r"(vmx->loaded_vmcs), 2836 [launched]"i"(offsetof(struct loaded_vmcs, launched)), 2837 [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), 2838 [wordsize]"i"(sizeof(ulong)) 2839 : "memory" 2840 ); 2841 2842 if (vmx->msr_autoload.host.nr) 2843 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2844 if (vmx->msr_autoload.guest.nr) 2845 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2846 2847 if (vm_fail) { 2848 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 2849 2850 preempt_enable(); 2851 2852 trace_kvm_nested_vmenter_failed( 2853 "early hardware check VM-instruction error: ", error); 2854 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 2855 return 1; 2856 } 2857 2858 /* 2859 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 2860 */ 2861 local_irq_enable(); 2862 if (hw_breakpoint_active()) 2863 set_debugreg(__this_cpu_read(cpu_dr7), 7); 2864 preempt_enable(); 2865 2866 /* 2867 * A non-failing VMEntry means we somehow entered guest mode with 2868 * an illegal RIP, and that's just the tip of the iceberg. There 2869 * is no telling what memory has been modified or what state has 2870 * been exposed to unknown code. Hitting this all but guarantees 2871 * a (very critical) hardware issue. 2872 */ 2873 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 2874 VMX_EXIT_REASONS_FAILED_VMENTRY)); 2875 2876 return 0; 2877 } 2878 2879 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 2880 struct vmcs12 *vmcs12); 2881 2882 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 2883 { 2884 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2885 struct vcpu_vmx *vmx = to_vmx(vcpu); 2886 struct kvm_host_map *map; 2887 struct page *page; 2888 u64 hpa; 2889 2890 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 2891 /* 2892 * Translate L1 physical address to host physical 2893 * address for vmcs02. Keep the page pinned, so this 2894 * physical address remains valid. We keep a reference 2895 * to it so we can release it later. 2896 */ 2897 if (vmx->nested.apic_access_page) { /* shouldn't happen */ 2898 kvm_release_page_dirty(vmx->nested.apic_access_page); 2899 vmx->nested.apic_access_page = NULL; 2900 } 2901 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); 2902 /* 2903 * If translation failed, no matter: This feature asks 2904 * to exit when accessing the given address, and if it 2905 * can never be accessed, this feature won't do 2906 * anything anyway. 2907 */ 2908 if (!is_error_page(page)) { 2909 vmx->nested.apic_access_page = page; 2910 hpa = page_to_phys(vmx->nested.apic_access_page); 2911 vmcs_write64(APIC_ACCESS_ADDR, hpa); 2912 } else { 2913 secondary_exec_controls_clearbit(vmx, 2914 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 2915 } 2916 } 2917 2918 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 2919 map = &vmx->nested.virtual_apic_map; 2920 2921 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 2922 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 2923 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 2924 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 2925 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 2926 /* 2927 * The processor will never use the TPR shadow, simply 2928 * clear the bit from the execution control. 
			 * Such a configuration is useless, but it happens in
			 * tests. For any other configuration, failing the vm
			 * entry is _not_ what the processor does but it's
			 * basically the only possibility we have.
			 */
			exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
		} else {
			/*
			 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
			 * force VM-Entry to fail.
			 */
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
		}
	}

	if (nested_cpu_has_posted_intr(vmcs12)) {
		map = &vmx->nested.pi_desc_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
			vmx->nested.pi_desc =
				(struct pi_desc *)(((void *)map->hva) +
				offset_in_page(vmcs12->posted_intr_desc_addr));
			vmcs_write64(POSTED_INTR_DESC_ADDR,
				     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
		}
	}
	if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
		exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
	else
		exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
}

/*
 * Intel's VMX Instruction Reference specifies a common set of prerequisites
 * for running VMX instructions (except VMXON, whose prerequisites are
 * slightly different). It also specifies what exception to inject otherwise.
 * Note that many of these exceptions have priority over VM exits, so they
 * don't have to be checked again here.
 */
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{
	if (!to_vmx(vcpu)->nested.vmxon) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 0;
	}

	if (vmx_get_cpl(vcpu)) {
		kvm_inject_gp(vcpu, 0);
		return 0;
	}

	return 1;
}

static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
{
	u8 rvi = vmx_get_rvi();
	u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);

	return ((rvi & 0xf0) > (vppr & 0xf0));
}

static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12);

/*
 * If from_vmentry is false, this is being called from state restore (either RSM
 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
 *
 * Returns:
 *	 0 - success, i.e. proceed with actual VMEnter
 *	 1 - consistency check VMExit
 *	-1 - consistency check VMFail
 */
int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	bool evaluate_pending_interrupts;
	u32 exit_reason = EXIT_REASON_INVALID_STATE;
	u32 exit_qual;

	evaluate_pending_interrupts = exec_controls_get(vmx) &
		(CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
	if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
		evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);

	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
	if (kvm_mpx_supported() &&
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);

	/*
	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
	 * nested early checks are disabled. In the event of a "late" VM-Fail,
	 * i.e.
a VM-Fail detected by hardware but not KVM, KVM must unwind its 3026 * software model to the pre-VMEntry host state. When EPT is disabled, 3027 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3028 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3029 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3030 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3031 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3032 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3033 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3034 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3035 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3036 * path would need to manually save/restore vmcs01.GUEST_CR3. 3037 */ 3038 if (!enable_ept && !nested_early_check) 3039 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3040 3041 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3042 3043 prepare_vmcs02_early(vmx, vmcs12); 3044 3045 if (from_vmentry) { 3046 nested_get_vmcs12_pages(vcpu); 3047 3048 if (nested_vmx_check_vmentry_hw(vcpu)) { 3049 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3050 return -1; 3051 } 3052 3053 if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) 3054 goto vmentry_fail_vmexit; 3055 } 3056 3057 enter_guest_mode(vcpu); 3058 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 3059 vcpu->arch.tsc_offset += vmcs12->tsc_offset; 3060 3061 if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) 3062 goto vmentry_fail_vmexit_guest_mode; 3063 3064 if (from_vmentry) { 3065 exit_reason = EXIT_REASON_MSR_LOAD_FAIL; 3066 exit_qual = nested_vmx_load_msr(vcpu, 3067 vmcs12->vm_entry_msr_load_addr, 3068 vmcs12->vm_entry_msr_load_count); 3069 if (exit_qual) 3070 goto vmentry_fail_vmexit_guest_mode; 3071 } else { 3072 /* 3073 * The MMU is not initialized to point at the right entities yet and 3074 * "get pages" would need to read data from the guest (i.e. we will 3075 * need to perform gpa to hpa translation). Request a call 3076 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3077 * have already been set at vmentry time and should not be reset. 3078 */ 3079 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); 3080 } 3081 3082 /* 3083 * If L1 had a pending IRQ/NMI until it executed 3084 * VMLAUNCH/VMRESUME which wasn't delivered because it was 3085 * disallowed (e.g. interrupts disabled), L0 needs to 3086 * evaluate if this pending event should cause an exit from L2 3087 * to L1 or delivered directly to L2 (e.g. In case L1 don't 3088 * intercept EXTERNAL_INTERRUPT). 3089 * 3090 * Usually this would be handled by the processor noticing an 3091 * IRQ/NMI window request, or checking RVI during evaluation of 3092 * pending virtual interrupts. However, this setting was done 3093 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 3094 * to perform pending event evaluation by requesting a KVM_REQ_EVENT. 3095 */ 3096 if (unlikely(evaluate_pending_interrupts)) 3097 kvm_make_request(KVM_REQ_EVENT, vcpu); 3098 3099 /* 3100 * Do not start the preemption timer hrtimer until after we know 3101 * we are successful, so that only nested_vmx_vmexit needs to cancel 3102 * the timer. 3103 */ 3104 vmx->nested.preemption_timer_expired = false; 3105 if (nested_cpu_has_preemption_timer(vmcs12)) 3106 vmx_start_preemption_timer(vcpu); 3107 3108 /* 3109 * Note no nested_vmx_succeed or nested_vmx_fail here. 
At this point 3110 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3111 * returned as far as L1 is concerned. It will only return (and set 3112 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3113 */ 3114 return 0; 3115 3116 /* 3117 * A failed consistency check that leads to a VMExit during L1's 3118 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3119 * 26.7 "VM-entry failures during or after loading guest state". 3120 */ 3121 vmentry_fail_vmexit_guest_mode: 3122 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 3123 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3124 leave_guest_mode(vcpu); 3125 3126 vmentry_fail_vmexit: 3127 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3128 3129 if (!from_vmentry) 3130 return 1; 3131 3132 load_vmcs12_host_state(vcpu, vmcs12); 3133 vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 3134 vmcs12->exit_qualification = exit_qual; 3135 if (enable_shadow_vmcs || vmx->nested.hv_evmcs) 3136 vmx->nested.need_vmcs12_to_shadow_sync = true; 3137 return 1; 3138 } 3139 3140 /* 3141 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3142 * for running an L2 nested guest. 3143 */ 3144 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3145 { 3146 struct vmcs12 *vmcs12; 3147 struct vcpu_vmx *vmx = to_vmx(vcpu); 3148 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3149 int ret; 3150 3151 if (!nested_vmx_check_permission(vcpu)) 3152 return 1; 3153 3154 if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch)) 3155 return 1; 3156 3157 if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) 3158 return nested_vmx_failInvalid(vcpu); 3159 3160 vmcs12 = get_vmcs12(vcpu); 3161 3162 /* 3163 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3164 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3165 * rather than RFLAGS.ZF, and no error number is stored to the 3166 * VM-instruction error field. 3167 */ 3168 if (vmcs12->hdr.shadow_vmcs) 3169 return nested_vmx_failInvalid(vcpu); 3170 3171 if (vmx->nested.hv_evmcs) { 3172 copy_enlightened_to_vmcs12(vmx); 3173 /* Enlightened VMCS doesn't have launch state */ 3174 vmcs12->launch_state = !launch; 3175 } else if (enable_shadow_vmcs) { 3176 copy_shadow_to_vmcs12(vmx); 3177 } 3178 3179 /* 3180 * The nested entry process starts with enforcing various prerequisites 3181 * on vmcs12 as required by the Intel SDM, and act appropriately when 3182 * they fail: As the SDM explains, some conditions should cause the 3183 * instruction to fail, while others will cause the instruction to seem 3184 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3185 * To speed up the normal (success) code path, we should avoid checking 3186 * for misconfigurations which will anyway be caught by the processor 3187 * when using the merged vmcs02. 3188 */ 3189 if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) 3190 return nested_vmx_failValid(vcpu, 3191 VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3192 3193 if (vmcs12->launch_state == launch) 3194 return nested_vmx_failValid(vcpu, 3195 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS
				       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);

	if (nested_vmx_check_controls(vcpu, vmcs12))
		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	if (nested_vmx_check_host_state(vcpu, vmcs12))
		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	/*
	 * We're finally done with prerequisite checking, and can start with
	 * the nested entry.
	 */
	vmx->nested.nested_run_pending = 1;
	ret = nested_vmx_enter_non_root_mode(vcpu, true);
	vmx->nested.nested_run_pending = !ret;
	if (ret > 0)
		return 1;
	else if (ret)
		return nested_vmx_failValid(vcpu,
					    VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	/* Hide L1D cache contents from the nested guest. */
	vmx->vcpu.arch.l1tf_flush_l1d = true;

	/*
	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
	 * also be used as part of restoring nVMX state for
	 * snapshot restore (migration).
	 *
	 * In this flow, it is assumed that the vmcs12 cache was
	 * transferred as part of the captured nVMX state and should
	 * therefore not be read from guest memory (which may not
	 * exist on the destination host yet).
	 */
	nested_cache_shadow_vmcs12(vcpu, vmcs12);

	/*
	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
	 * awakened by event injection or by an NMI-window VM-exit or
	 * by an interrupt-window VM-exit, halt the vcpu.
	 */
	if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
	    !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
	    !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
	    !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
	      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
		vmx->nested.nested_run_pending = 0;
		return kvm_vcpu_halt(vcpu);
	}
	return 1;
}

/*
 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
 * This function returns the new value we should put in vmcs12.guest_cr0.
 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
 *    available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
 *    didn't trap the bit, because if L1 did, so would L0).
 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
 *    been modified by L2, and L1 knows it. So just leave the old value of
 *    the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
 *    isn't relevant, because if L0 traps this bit it can set it to anything.
 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
 *    changed these bits, and therefore they need to be updated, but L0
 *    didn't necessarily allow them to be changed in GUEST_CR0 - and rather
 *    put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
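 *
 * (Illustration with made-up masks: if vcpu->arch.cr0_guest_owned_bits is
 *  0x8 and vmcs12->cr0_guest_host_mask is 0x1, then bit 3 of the result
 *  comes from vmcs02 GUEST_CR0, bit 0 comes from vmcs12->guest_cr0, and
 *  every other bit comes from vmcs02's CR0_READ_SHADOW.)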
3264 */ 3265 static inline unsigned long 3266 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3267 { 3268 return 3269 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3270 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3271 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3272 vcpu->arch.cr0_guest_owned_bits)); 3273 } 3274 3275 static inline unsigned long 3276 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3277 { 3278 return 3279 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3280 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3281 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3282 vcpu->arch.cr4_guest_owned_bits)); 3283 } 3284 3285 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3286 struct vmcs12 *vmcs12) 3287 { 3288 u32 idt_vectoring; 3289 unsigned int nr; 3290 3291 if (vcpu->arch.exception.injected) { 3292 nr = vcpu->arch.exception.nr; 3293 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3294 3295 if (kvm_exception_is_soft(nr)) { 3296 vmcs12->vm_exit_instruction_len = 3297 vcpu->arch.event_exit_inst_len; 3298 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3299 } else 3300 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3301 3302 if (vcpu->arch.exception.has_error_code) { 3303 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3304 vmcs12->idt_vectoring_error_code = 3305 vcpu->arch.exception.error_code; 3306 } 3307 3308 vmcs12->idt_vectoring_info_field = idt_vectoring; 3309 } else if (vcpu->arch.nmi_injected) { 3310 vmcs12->idt_vectoring_info_field = 3311 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3312 } else if (vcpu->arch.interrupt.injected) { 3313 nr = vcpu->arch.interrupt.nr; 3314 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3315 3316 if (vcpu->arch.interrupt.soft) { 3317 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3318 vmcs12->vm_entry_instruction_len = 3319 vcpu->arch.event_exit_inst_len; 3320 } else 3321 idt_vectoring |= INTR_TYPE_EXT_INTR; 3322 3323 vmcs12->idt_vectoring_info_field = idt_vectoring; 3324 } 3325 } 3326 3327 3328 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3329 { 3330 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3331 gfn_t gfn; 3332 3333 /* 3334 * Don't need to mark the APIC access page dirty; it is never 3335 * written to by the CPU during APIC virtualization. 
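	 * The virtual-APIC page and posted-interrupt descriptor, however,
	 * are marked dirty below, presumably so that dirty logging (e.g.
	 * during live migration) picks up writes the CPU made to them while
	 * L2 was running.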
3336 */ 3337 3338 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3339 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3340 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3341 } 3342 3343 if (nested_cpu_has_posted_intr(vmcs12)) { 3344 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3345 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3346 } 3347 } 3348 3349 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3350 { 3351 struct vcpu_vmx *vmx = to_vmx(vcpu); 3352 int max_irr; 3353 void *vapic_page; 3354 u16 status; 3355 3356 if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) 3357 return; 3358 3359 vmx->nested.pi_pending = false; 3360 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3361 return; 3362 3363 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3364 if (max_irr != 256) { 3365 vapic_page = vmx->nested.virtual_apic_map.hva; 3366 if (!vapic_page) 3367 return; 3368 3369 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3370 vapic_page, &max_irr); 3371 status = vmcs_read16(GUEST_INTR_STATUS); 3372 if ((u8)max_irr > ((u8)status & 0xff)) { 3373 status &= ~0xff; 3374 status |= (u8)max_irr; 3375 vmcs_write16(GUEST_INTR_STATUS, status); 3376 } 3377 } 3378 3379 nested_mark_vmcs12_pages_dirty(vcpu); 3380 } 3381 3382 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3383 unsigned long exit_qual) 3384 { 3385 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3386 unsigned int nr = vcpu->arch.exception.nr; 3387 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3388 3389 if (vcpu->arch.exception.has_error_code) { 3390 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3391 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3392 } 3393 3394 if (kvm_exception_is_soft(nr)) 3395 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3396 else 3397 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3398 3399 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3400 vmx_get_nmi_mask(vcpu)) 3401 intr_info |= INTR_INFO_UNBLOCK_NMI; 3402 3403 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3404 } 3405 3406 static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) 3407 { 3408 struct vcpu_vmx *vmx = to_vmx(vcpu); 3409 unsigned long exit_qual; 3410 bool block_nested_events = 3411 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3412 struct kvm_lapic *apic = vcpu->arch.apic; 3413 3414 if (lapic_in_kernel(vcpu) && 3415 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3416 if (block_nested_events) 3417 return -EBUSY; 3418 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3419 return 0; 3420 } 3421 3422 if (vcpu->arch.exception.pending && 3423 nested_vmx_check_exception(vcpu, &exit_qual)) { 3424 if (block_nested_events) 3425 return -EBUSY; 3426 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3427 return 0; 3428 } 3429 3430 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3431 vmx->nested.preemption_timer_expired) { 3432 if (block_nested_events) 3433 return -EBUSY; 3434 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3435 return 0; 3436 } 3437 3438 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { 3439 if (block_nested_events) 3440 return -EBUSY; 3441 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3442 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3443 INTR_INFO_VALID_MASK, 0); 3444 /* 3445 * The NMI-triggered VM exit counts as injection: 3446 * clear this one and block further NMIs. 
3447 */ 3448 vcpu->arch.nmi_pending = 0; 3449 vmx_set_nmi_mask(vcpu, true); 3450 return 0; 3451 } 3452 3453 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && 3454 nested_exit_on_intr(vcpu)) { 3455 if (block_nested_events) 3456 return -EBUSY; 3457 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3458 return 0; 3459 } 3460 3461 vmx_complete_nested_posted_interrupt(vcpu); 3462 return 0; 3463 } 3464 3465 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3466 { 3467 ktime_t remaining = 3468 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 3469 u64 value; 3470 3471 if (ktime_to_ns(remaining) <= 0) 3472 return 0; 3473 3474 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 3475 do_div(value, 1000000); 3476 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 3477 } 3478 3479 static bool is_vmcs12_ext_field(unsigned long field) 3480 { 3481 switch (field) { 3482 case GUEST_ES_SELECTOR: 3483 case GUEST_CS_SELECTOR: 3484 case GUEST_SS_SELECTOR: 3485 case GUEST_DS_SELECTOR: 3486 case GUEST_FS_SELECTOR: 3487 case GUEST_GS_SELECTOR: 3488 case GUEST_LDTR_SELECTOR: 3489 case GUEST_TR_SELECTOR: 3490 case GUEST_ES_LIMIT: 3491 case GUEST_CS_LIMIT: 3492 case GUEST_SS_LIMIT: 3493 case GUEST_DS_LIMIT: 3494 case GUEST_FS_LIMIT: 3495 case GUEST_GS_LIMIT: 3496 case GUEST_LDTR_LIMIT: 3497 case GUEST_TR_LIMIT: 3498 case GUEST_GDTR_LIMIT: 3499 case GUEST_IDTR_LIMIT: 3500 case GUEST_ES_AR_BYTES: 3501 case GUEST_DS_AR_BYTES: 3502 case GUEST_FS_AR_BYTES: 3503 case GUEST_GS_AR_BYTES: 3504 case GUEST_LDTR_AR_BYTES: 3505 case GUEST_TR_AR_BYTES: 3506 case GUEST_ES_BASE: 3507 case GUEST_CS_BASE: 3508 case GUEST_SS_BASE: 3509 case GUEST_DS_BASE: 3510 case GUEST_FS_BASE: 3511 case GUEST_GS_BASE: 3512 case GUEST_LDTR_BASE: 3513 case GUEST_TR_BASE: 3514 case GUEST_GDTR_BASE: 3515 case GUEST_IDTR_BASE: 3516 case GUEST_PENDING_DBG_EXCEPTIONS: 3517 case GUEST_BNDCFGS: 3518 return true; 3519 default: 3520 break; 3521 } 3522 3523 return false; 3524 } 3525 3526 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3527 struct vmcs12 *vmcs12) 3528 { 3529 struct vcpu_vmx *vmx = to_vmx(vcpu); 3530 3531 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 3532 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 3533 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 3534 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 3535 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 3536 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 3537 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 3538 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 3539 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 3540 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 3541 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 3542 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 3543 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 3544 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 3545 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 3546 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 3547 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 3548 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 3549 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 3550 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 3551 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 3552 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 3553 
vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 3554 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 3555 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 3556 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 3557 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 3558 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 3559 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 3560 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 3561 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 3562 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 3563 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 3564 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 3565 vmcs12->guest_pending_dbg_exceptions = 3566 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 3567 if (kvm_mpx_supported()) 3568 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3569 3570 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 3571 } 3572 3573 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3574 struct vmcs12 *vmcs12) 3575 { 3576 struct vcpu_vmx *vmx = to_vmx(vcpu); 3577 int cpu; 3578 3579 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 3580 return; 3581 3582 3583 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 3584 3585 cpu = get_cpu(); 3586 vmx->loaded_vmcs = &vmx->nested.vmcs02; 3587 vmx_vcpu_load(&vmx->vcpu, cpu); 3588 3589 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 3590 3591 vmx->loaded_vmcs = &vmx->vmcs01; 3592 vmx_vcpu_load(&vmx->vcpu, cpu); 3593 put_cpu(); 3594 } 3595 3596 /* 3597 * Update the guest state fields of vmcs12 to reflect changes that 3598 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 3599 * VM-entry controls is also updated, since this is really a guest 3600 * state bit.) 3601 */ 3602 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3603 { 3604 struct vcpu_vmx *vmx = to_vmx(vcpu); 3605 3606 if (vmx->nested.hv_evmcs) 3607 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 3608 3609 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs; 3610 3611 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 3612 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 3613 3614 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 3615 vmcs12->guest_rip = kvm_rip_read(vcpu); 3616 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 3617 3618 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 3619 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 3620 3621 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); 3622 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); 3623 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); 3624 3625 vmcs12->guest_interruptibility_info = 3626 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 3627 3628 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3629 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 3630 else 3631 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 3632 3633 if (nested_cpu_has_preemption_timer(vmcs12) && 3634 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) 3635 vmcs12->vmx_preemption_timer_value = 3636 vmx_get_preemption_timer_value(vcpu); 3637 3638 /* 3639 * In some cases (usually, nested EPT), L2 is allowed to change its 3640 * own CR3 without exiting. If it has changed it, we must keep it. 3641 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 3642 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 3643 * 3644 * Additionally, restore L2's PDPTR to vmcs12. 
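         * (With shadow paging, i.e. when enable_ept is clear, the block below
         * is skipped entirely and the value already in vmcs12.guest_cr3 is
         * left untouched.)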
3645 */ 3646 if (enable_ept) { 3647 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 3648 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3649 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 3650 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 3651 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 3652 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 3653 } 3654 } 3655 3656 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 3657 3658 if (nested_cpu_has_vid(vmcs12)) 3659 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 3660 3661 vmcs12->vm_entry_controls = 3662 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 3663 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 3664 3665 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 3666 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 3667 3668 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 3669 vmcs12->guest_ia32_efer = vcpu->arch.efer; 3670 } 3671 3672 /* 3673 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 3674 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 3675 * and this function updates it to reflect the changes to the guest state while 3676 * L2 was running (and perhaps made some exits which were handled directly by L0 3677 * without going back to L1), and to reflect the exit reason. 3678 * Note that we do not have to copy here all VMCS fields, just those that 3679 * could have changed by the L2 guest or the exit - i.e., the guest-state and 3680 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 3681 * which already writes to vmcs12 directly. 3682 */ 3683 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 3684 u32 exit_reason, u32 exit_intr_info, 3685 unsigned long exit_qualification) 3686 { 3687 /* update exit information fields: */ 3688 vmcs12->vm_exit_reason = exit_reason; 3689 vmcs12->exit_qualification = exit_qualification; 3690 vmcs12->vm_exit_intr_info = exit_intr_info; 3691 3692 vmcs12->idt_vectoring_info_field = 0; 3693 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3694 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 3695 3696 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 3697 vmcs12->launch_state = 1; 3698 3699 /* vm_entry_intr_info_field is cleared on exit. Emulate this 3700 * instead of reading the real value. */ 3701 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 3702 3703 /* 3704 * Transfer the event that L0 or L1 may wanted to inject into 3705 * L2 to IDT_VECTORING_INFO_FIELD. 3706 */ 3707 vmcs12_save_pending_event(vcpu, vmcs12); 3708 3709 /* 3710 * According to spec, there's no need to store the guest's 3711 * MSRs if the exit is due to a VM-entry failure that occurs 3712 * during or after loading the guest state. Since this exit 3713 * does not fall in that category, we need to save the MSRs. 3714 */ 3715 if (nested_vmx_store_msr(vcpu, 3716 vmcs12->vm_exit_msr_store_addr, 3717 vmcs12->vm_exit_msr_store_count)) 3718 nested_vmx_abort(vcpu, 3719 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 3720 } 3721 3722 /* 3723 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 3724 * preserved above and would only end up incorrectly in L1. 
3725          */
3726         vcpu->arch.nmi_injected = false;
3727         kvm_clear_exception_queue(vcpu);
3728         kvm_clear_interrupt_queue(vcpu);
3729 }
3730
3731 /*
3732  * A part of what we need to do when the nested L2 guest exits and we want to
3733  * run its L1 parent, is to reset L1's guest state to the host state specified
3734  * in vmcs12.
3735  * This function is to be called not only on normal nested exit, but also on
3736  * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
3737  * Failures During or After Loading Guest State").
3738  * This function should be called when the active VMCS is L1's (vmcs01).
3739  */
3740 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3741                                    struct vmcs12 *vmcs12)
3742 {
3743         struct kvm_segment seg;
3744         u32 entry_failure_code;
3745
3746         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
3747                 vcpu->arch.efer = vmcs12->host_ia32_efer;
3748         else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3749                 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
3750         else
3751                 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
3752         vmx_set_efer(vcpu, vcpu->arch.efer);
3753
3754         kvm_rsp_write(vcpu, vmcs12->host_rsp);
3755         kvm_rip_write(vcpu, vmcs12->host_rip);
3756         vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
3757         vmx_set_interrupt_shadow(vcpu, 0);
3758
3759         /*
3760          * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
3761          * actually changed, because vmx_set_cr0 refers to efer set above.
3762          *
3763          * CR0_GUEST_HOST_MASK is already set in the original vmcs01
3764          * (KVM doesn't change it).
3765          */
3766         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3767         vmx_set_cr0(vcpu, vmcs12->host_cr0);
3768
3769         /* Same as above - no reason to call set_cr4_guest_host_mask(). */
3770         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3771         vmx_set_cr4(vcpu, vmcs12->host_cr4);
3772
3773         nested_ept_uninit_mmu_context(vcpu);
3774
3775         /*
3776          * Only PDPTE load can fail as the value of cr3 was checked on entry and
3777          * couldn't have changed.
3778          */
3779         if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
3780                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
3781
3782         if (!enable_ept)
3783                 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3784
3785         /*
3786          * If vmcs01 doesn't use VPID, CPU flushes TLB on every
3787          * VMEntry/VMExit. Thus, no need to flush TLB.
3788          *
3789          * If vmcs12 doesn't use VPID, L1 expects TLB to be
3790          * flushed on every VMEntry/VMExit.
3791          *
3792          * Otherwise, we can preserve TLB entries as long as we are
3793          * able to tag L1 TLB entries differently than L2 TLB entries.
3794          *
3795          * If vmcs12 uses EPT, we need to execute this flush on EPTP01
3796          * and therefore we request the TLB flush to happen only after VMCS EPTP
3797          * has been set by KVM_REQ_LOAD_CR3.
3798          */
3799         if (enable_vpid &&
3800             (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
3801                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3802         }
3803
3804         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
3805         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
3806         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
3807         vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
3808         vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
3809         vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
3810         vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
3811
3812         /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.
*/ 3813 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 3814 vmcs_write64(GUEST_BNDCFGS, 0); 3815 3816 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 3817 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 3818 vcpu->arch.pat = vmcs12->host_ia32_pat; 3819 } 3820 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 3821 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 3822 vmcs12->host_ia32_perf_global_ctrl); 3823 3824 /* Set L1 segment info according to Intel SDM 3825 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 3826 seg = (struct kvm_segment) { 3827 .base = 0, 3828 .limit = 0xFFFFFFFF, 3829 .selector = vmcs12->host_cs_selector, 3830 .type = 11, 3831 .present = 1, 3832 .s = 1, 3833 .g = 1 3834 }; 3835 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 3836 seg.l = 1; 3837 else 3838 seg.db = 1; 3839 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 3840 seg = (struct kvm_segment) { 3841 .base = 0, 3842 .limit = 0xFFFFFFFF, 3843 .type = 3, 3844 .present = 1, 3845 .s = 1, 3846 .db = 1, 3847 .g = 1 3848 }; 3849 seg.selector = vmcs12->host_ds_selector; 3850 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 3851 seg.selector = vmcs12->host_es_selector; 3852 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 3853 seg.selector = vmcs12->host_ss_selector; 3854 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 3855 seg.selector = vmcs12->host_fs_selector; 3856 seg.base = vmcs12->host_fs_base; 3857 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 3858 seg.selector = vmcs12->host_gs_selector; 3859 seg.base = vmcs12->host_gs_base; 3860 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 3861 seg = (struct kvm_segment) { 3862 .base = vmcs12->host_tr_base, 3863 .limit = 0x67, 3864 .selector = vmcs12->host_tr_selector, 3865 .type = 11, 3866 .present = 1 3867 }; 3868 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 3869 3870 kvm_set_dr(vcpu, 7, 0x400); 3871 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 3872 3873 if (cpu_has_vmx_msr_bitmap()) 3874 vmx_update_msr_bitmap(vcpu); 3875 3876 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 3877 vmcs12->vm_exit_msr_load_count)) 3878 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 3879 } 3880 3881 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 3882 { 3883 struct shared_msr_entry *efer_msr; 3884 unsigned int i; 3885 3886 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 3887 return vmcs_read64(GUEST_IA32_EFER); 3888 3889 if (cpu_has_load_ia32_efer()) 3890 return host_efer; 3891 3892 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 3893 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 3894 return vmx->msr_autoload.guest.val[i].value; 3895 } 3896 3897 efer_msr = find_msr_entry(vmx, MSR_EFER); 3898 if (efer_msr) 3899 return efer_msr->data; 3900 3901 return host_efer; 3902 } 3903 3904 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 3905 { 3906 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3907 struct vcpu_vmx *vmx = to_vmx(vcpu); 3908 struct vmx_msr_entry g, h; 3909 gpa_t gpa; 3910 u32 i, j; 3911 3912 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 3913 3914 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 3915 /* 3916 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 3917 * as vmcs01.GUEST_DR7 contains a userspace defined value 3918 * and vcpu->arch.dr7 is not squirreled away before the 3919 * nested VMENTER (not worth adding a variable in nested_vmx). 
3920 */ 3921 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 3922 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 3923 else 3924 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 3925 } 3926 3927 /* 3928 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 3929 * handle a variety of side effects to KVM's software model. 3930 */ 3931 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 3932 3933 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 3934 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 3935 3936 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 3937 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 3938 3939 nested_ept_uninit_mmu_context(vcpu); 3940 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3941 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 3942 3943 /* 3944 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 3945 * from vmcs01 (if necessary). The PDPTRs are not loaded on 3946 * VMFail, like everything else we just need to ensure our 3947 * software model is up-to-date. 3948 */ 3949 if (enable_ept) 3950 ept_save_pdptrs(vcpu); 3951 3952 kvm_mmu_reset_context(vcpu); 3953 3954 if (cpu_has_vmx_msr_bitmap()) 3955 vmx_update_msr_bitmap(vcpu); 3956 3957 /* 3958 * This nasty bit of open coding is a compromise between blindly 3959 * loading L1's MSRs using the exit load lists (incorrect emulation 3960 * of VMFail), leaving the nested VM's MSRs in the software model 3961 * (incorrect behavior) and snapshotting the modified MSRs (too 3962 * expensive since the lists are unbound by hardware). For each 3963 * MSR that was (prematurely) loaded from the nested VMEntry load 3964 * list, reload it from the exit load list if it exists and differs 3965 * from the guest value. The intent is to stuff host state as 3966 * silently as possible, not to fully process the exit load list. 3967 */ 3968 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 3969 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 3970 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 3971 pr_debug_ratelimited( 3972 "%s read MSR index failed (%u, 0x%08llx)\n", 3973 __func__, i, gpa); 3974 goto vmabort; 3975 } 3976 3977 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 3978 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 3979 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 3980 pr_debug_ratelimited( 3981 "%s read MSR failed (%u, 0x%08llx)\n", 3982 __func__, j, gpa); 3983 goto vmabort; 3984 } 3985 if (h.index != g.index) 3986 continue; 3987 if (h.value == g.value) 3988 break; 3989 3990 if (nested_vmx_load_msr_check(vcpu, &h)) { 3991 pr_debug_ratelimited( 3992 "%s check failed (%u, 0x%x, 0x%x)\n", 3993 __func__, j, h.index, h.reserved); 3994 goto vmabort; 3995 } 3996 3997 if (kvm_set_msr(vcpu, h.index, h.value)) { 3998 pr_debug_ratelimited( 3999 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4000 __func__, j, h.index, h.value); 4001 goto vmabort; 4002 } 4003 } 4004 } 4005 4006 return; 4007 4008 vmabort: 4009 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4010 } 4011 4012 /* 4013 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4014 * and modify vmcs12 to make it see what it would expect to see there if 4015 * L2 was its real guest. 
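 * In outline: the guest-state fields of vmcs12 are refreshed from vmcs02,
 * the CPU is switched back to vmcs01, the L1 host state requested by
 * vmcs12 is loaded, and the exit information is recorded so that L1 can
 * handle the exit.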
Must only be called when in L2 (is_guest_mode()) 4016 */ 4017 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, 4018 u32 exit_intr_info, unsigned long exit_qualification) 4019 { 4020 struct vcpu_vmx *vmx = to_vmx(vcpu); 4021 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4022 4023 /* trying to cancel vmlaunch/vmresume is a bug */ 4024 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4025 4026 leave_guest_mode(vcpu); 4027 4028 if (nested_cpu_has_preemption_timer(vmcs12)) 4029 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4030 4031 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 4032 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 4033 4034 if (likely(!vmx->fail)) { 4035 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4036 4037 if (exit_reason != -1) 4038 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 4039 exit_qualification); 4040 4041 /* 4042 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4043 * also be used to capture vmcs12 cache as part of 4044 * capturing nVMX state for snapshot (migration). 4045 * 4046 * Otherwise, this flush will dirty guest memory at a 4047 * point it is already assumed by user-space to be 4048 * immutable. 4049 */ 4050 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4051 } else { 4052 /* 4053 * The only expected VM-instruction error is "VM entry with 4054 * invalid control field(s)." Anything else indicates a 4055 * problem with L0. And we should never get here with a 4056 * VMFail of any type if early consistency checks are enabled. 4057 */ 4058 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4059 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4060 WARN_ON_ONCE(nested_early_check); 4061 } 4062 4063 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4064 4065 /* Update any VMCS fields that might have changed while L2 ran */ 4066 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4067 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4068 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4069 4070 if (kvm_has_tsc_control) 4071 decache_tsc_multiplier(vmx); 4072 4073 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4074 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4075 vmx_set_virtual_apic_mode(vcpu); 4076 } else if (!nested_cpu_has_ept(vmcs12) && 4077 nested_cpu_has2(vmcs12, 4078 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 4079 vmx_flush_tlb(vcpu, true); 4080 } 4081 4082 /* Unpin physical memory we referred to in vmcs02 */ 4083 if (vmx->nested.apic_access_page) { 4084 kvm_release_page_dirty(vmx->nested.apic_access_page); 4085 vmx->nested.apic_access_page = NULL; 4086 } 4087 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4088 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4089 vmx->nested.pi_desc = NULL; 4090 4091 /* 4092 * We are now running in L2, mmu_notifier will force to reload the 4093 * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. 4094 */ 4095 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4096 4097 if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) 4098 vmx->nested.need_vmcs12_to_shadow_sync = true; 4099 4100 /* in case we halted in L2 */ 4101 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4102 4103 if (likely(!vmx->fail)) { 4104 /* 4105 * TODO: SDM says that with acknowledge interrupt on 4106 * exit, bit 31 of the VM-exit interrupt information 4107 * (valid interrupt) is always set to 1 on 4108 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't 4109 * need kvm_cpu_has_interrupt(). See the commit 4110 * message for details. 
4111 */ 4112 if (nested_exit_intr_ack_set(vcpu) && 4113 exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4114 kvm_cpu_has_interrupt(vcpu)) { 4115 int irq = kvm_cpu_get_interrupt(vcpu); 4116 WARN_ON(irq < 0); 4117 vmcs12->vm_exit_intr_info = irq | 4118 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4119 } 4120 4121 if (exit_reason != -1) 4122 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4123 vmcs12->exit_qualification, 4124 vmcs12->idt_vectoring_info_field, 4125 vmcs12->vm_exit_intr_info, 4126 vmcs12->vm_exit_intr_error_code, 4127 KVM_ISA_VMX); 4128 4129 load_vmcs12_host_state(vcpu, vmcs12); 4130 4131 return; 4132 } 4133 4134 /* 4135 * After an early L2 VM-entry failure, we're now back 4136 * in L1 which thinks it just finished a VMLAUNCH or 4137 * VMRESUME instruction, so we need to set the failure 4138 * flag and the VM-instruction error field of the VMCS 4139 * accordingly, and skip the emulated instruction. 4140 */ 4141 (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4142 4143 /* 4144 * Restore L1's host state to KVM's software model. We're here 4145 * because a consistency check was caught by hardware, which 4146 * means some amount of guest state has been propagated to KVM's 4147 * model and needs to be unwound to the host's state. 4148 */ 4149 nested_vmx_restore_host_state(vcpu); 4150 4151 vmx->fail = 0; 4152 } 4153 4154 /* 4155 * Decode the memory-address operand of a vmx instruction, as recorded on an 4156 * exit caused by such an instruction (run by a guest hypervisor). 4157 * On success, returns 0. When the operand is invalid, returns 1 and throws 4158 * #UD or #GP. 4159 */ 4160 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4161 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4162 { 4163 gva_t off; 4164 bool exn; 4165 struct kvm_segment s; 4166 4167 /* 4168 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4169 * Execution", on an exit, vmx_instruction_info holds most of the 4170 * addressing components of the operand. Only the displacement part 4171 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4172 * For how an actual address is calculated from all these components, 4173 * refer to Vol. 1, "Operand Addressing". 4174 */ 4175 int scaling = vmx_instruction_info & 3; 4176 int addr_size = (vmx_instruction_info >> 7) & 7; 4177 bool is_reg = vmx_instruction_info & (1u << 10); 4178 int seg_reg = (vmx_instruction_info >> 15) & 7; 4179 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4180 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4181 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4182 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4183 4184 if (is_reg) { 4185 kvm_queue_exception(vcpu, UD_VECTOR); 4186 return 1; 4187 } 4188 4189 /* Addr = segment_base + offset */ 4190 /* offset = base + [index * scale] + displacement */ 4191 off = exit_qualification; /* holds the displacement */ 4192 if (addr_size == 1) 4193 off = (gva_t)sign_extend64(off, 31); 4194 else if (addr_size == 0) 4195 off = (gva_t)sign_extend64(off, 15); 4196 if (base_is_valid) 4197 off += kvm_register_read(vcpu, base_reg); 4198 if (index_is_valid) 4199 off += kvm_register_read(vcpu, index_reg)<<scaling; 4200 vmx_get_segment(vcpu, &s, seg_reg); 4201 4202 /* 4203 * The effective address, i.e. @off, of a memory operand is truncated 4204 * based on the address size of the instruction. Note that this is 4205 * the *effective address*, i.e. 
the address prior to accounting for 4206 * the segment's base. 4207 */ 4208 if (addr_size == 1) /* 32 bit */ 4209 off &= 0xffffffff; 4210 else if (addr_size == 0) /* 16 bit */ 4211 off &= 0xffff; 4212 4213 /* Checks for #GP/#SS exceptions. */ 4214 exn = false; 4215 if (is_long_mode(vcpu)) { 4216 /* 4217 * The virtual/linear address is never truncated in 64-bit 4218 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4219 * address when using FS/GS with a non-zero base. 4220 */ 4221 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4222 *ret = s.base + off; 4223 else 4224 *ret = off; 4225 4226 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4227 * non-canonical form. This is the only check on the memory 4228 * destination for long mode! 4229 */ 4230 exn = is_noncanonical_address(*ret, vcpu); 4231 } else { 4232 /* 4233 * When not in long mode, the virtual/linear address is 4234 * unconditionally truncated to 32 bits regardless of the 4235 * address size. 4236 */ 4237 *ret = (s.base + off) & 0xffffffff; 4238 4239 /* Protected mode: apply checks for segment validity in the 4240 * following order: 4241 * - segment type check (#GP(0) may be thrown) 4242 * - usability check (#GP(0)/#SS(0)) 4243 * - limit check (#GP(0)/#SS(0)) 4244 */ 4245 if (wr) 4246 /* #GP(0) if the destination operand is located in a 4247 * read-only data segment or any code segment. 4248 */ 4249 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4250 else 4251 /* #GP(0) if the source operand is located in an 4252 * execute-only code segment 4253 */ 4254 exn = ((s.type & 0xa) == 8); 4255 if (exn) { 4256 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4257 return 1; 4258 } 4259 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4260 */ 4261 exn = (s.unusable != 0); 4262 4263 /* 4264 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4265 * outside the segment limit. All CPUs that support VMX ignore 4266 * limit checks for flat segments, i.e. segments with base==0, 4267 * limit==0xffffffff and of type expand-up data or code. 4268 */ 4269 if (!(s.base == 0 && s.limit == 0xffffffff && 4270 ((s.type & 8) || !(s.type & 4)))) 4271 exn = exn || ((u64)off + len - 1 > s.limit); 4272 } 4273 if (exn) { 4274 kvm_queue_exception_e(vcpu, 4275 seg_reg == VCPU_SREG_SS ? 4276 SS_VECTOR : GP_VECTOR, 4277 0); 4278 return 1; 4279 } 4280 4281 return 0; 4282 } 4283 4284 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) 4285 { 4286 gva_t gva; 4287 struct x86_exception e; 4288 4289 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 4290 vmcs_read32(VMX_INSTRUCTION_INFO), false, 4291 sizeof(*vmpointer), &gva)) 4292 return 1; 4293 4294 if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { 4295 kvm_inject_page_fault(vcpu, &e); 4296 return 1; 4297 } 4298 4299 return 0; 4300 } 4301 4302 /* 4303 * Allocate a shadow VMCS and associate it with the currently loaded 4304 * VMCS, unless such a shadow VMCS already exists. The newly allocated 4305 * VMCS is also VMCLEARed, so that it is ready for use. 4306 */ 4307 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 4308 { 4309 struct vcpu_vmx *vmx = to_vmx(vcpu); 4310 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 4311 4312 /* 4313 * We should allocate a shadow vmcs for vmcs01 only when L1 4314 * executes VMXON and free it when L1 executes VMXOFF. 4315 * As it is invalid to execute VMXON twice, we shouldn't reach 4316 * here when vmcs01 already have an allocated shadow vmcs. 
4317          */
4318         WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4319
4320         if (!loaded_vmcs->shadow_vmcs) {
4321                 loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4322                 if (loaded_vmcs->shadow_vmcs)
4323                         vmcs_clear(loaded_vmcs->shadow_vmcs);
4324         }
4325         return loaded_vmcs->shadow_vmcs;
4326 }
4327
4328 static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4329 {
4330         struct vcpu_vmx *vmx = to_vmx(vcpu);
4331         int r;
4332
4333         r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4334         if (r < 0)
4335                 goto out_vmcs02;
4336
4337         vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4338         if (!vmx->nested.cached_vmcs12)
4339                 goto out_cached_vmcs12;
4340
4341         vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4342         if (!vmx->nested.cached_shadow_vmcs12)
4343                 goto out_cached_shadow_vmcs12;
4344
4345         if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4346                 goto out_shadow_vmcs;
4347
4348         hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4349                      HRTIMER_MODE_REL_PINNED);
4350         vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4351
4352         vmx->nested.vpid02 = allocate_vpid();
4353
4354         vmx->nested.vmcs02_initialized = false;
4355         vmx->nested.vmxon = true;
4356
4357         if (pt_mode == PT_MODE_HOST_GUEST) {
4358                 vmx->pt_desc.guest.ctl = 0;
4359                 pt_update_intercept_for_msr(vmx);
4360         }
4361
4362         return 0;
4363
4364 out_shadow_vmcs:
4365         kfree(vmx->nested.cached_shadow_vmcs12);
4366
4367 out_cached_shadow_vmcs12:
4368         kfree(vmx->nested.cached_vmcs12);
4369
4370 out_cached_vmcs12:
4371         free_loaded_vmcs(&vmx->nested.vmcs02);
4372
4373 out_vmcs02:
4374         return -ENOMEM;
4375 }
4376
4377 /*
4378  * Emulate the VMXON instruction.
4379  * Currently, we just remember that VMX is active, and do not save or even
4380  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4381  * do not currently need to store anything in that guest-allocated memory
4382  * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their
4383  * argument is different from the VMXON pointer (which the spec says they do).
4384  */
4385 static int handle_vmon(struct kvm_vcpu *vcpu)
4386 {
4387         int ret;
4388         gpa_t vmptr;
4389         uint32_t revision;
4390         struct vcpu_vmx *vmx = to_vmx(vcpu);
4391         const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
4392                 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
4393
4394         /*
4395          * The Intel VMX Instruction Reference lists a bunch of bits that are
4396          * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4397          * 1 (see vmx_set_cr4() for when we allow the guest to set this).
4398          * Otherwise, we should fail with #UD. But most faulting conditions
4399          * have already been checked by hardware, prior to the VM-exit for
4400          * VMXON. We do test guest cr4.VMXE because processor CR4 always has
4401          * that bit set to 1 in non-root mode.
4402          */
4403         if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4404                 kvm_queue_exception(vcpu, UD_VECTOR);
4405                 return 1;
4406         }
4407
4408         /* CPL=0 must be checked manually.
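         * (VMXON executed in VMX non-root operation causes a VM exit
         * regardless of CPL, so the #GP(0) that bare metal would raise for
         * CPL > 0 has to be injected by KVM itself.)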
 */
4409         if (vmx_get_cpl(vcpu)) {
4410                 kvm_inject_gp(vcpu, 0);
4411                 return 1;
4412         }
4413
4414         if (vmx->nested.vmxon)
4415                 return nested_vmx_failValid(vcpu,
4416                         VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4417
4418         if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4419                         != VMXON_NEEDED_FEATURES) {
4420                 kvm_inject_gp(vcpu, 0);
4421                 return 1;
4422         }
4423
4424         if (nested_vmx_get_vmptr(vcpu, &vmptr))
4425                 return 1;
4426
4427         /*
4428          * SDM 3: 24.11.5
4429          * The first 4 bytes of the VMXON region contain the supported
4430          * VMCS revision identifier
4431          *
4432          * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
4433          * which replaces physical address width with 32
4434          */
4435         if (!page_address_valid(vcpu, vmptr))
4436                 return nested_vmx_failInvalid(vcpu);
4437
4438         if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4439             revision != VMCS12_REVISION)
4440                 return nested_vmx_failInvalid(vcpu);
4441
4442         vmx->nested.vmxon_ptr = vmptr;
4443         ret = enter_vmx_operation(vcpu);
4444         if (ret)
4445                 return ret;
4446
4447         return nested_vmx_succeed(vcpu);
4448 }
4449
4450 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4451 {
4452         struct vcpu_vmx *vmx = to_vmx(vcpu);
4453
4454         if (vmx->nested.current_vmptr == -1ull)
4455                 return;
4456
4457         copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
4458
4459         if (enable_shadow_vmcs) {
4460                 /* copy to memory all shadowed fields in case
4461                    they were modified */
4462                 copy_shadow_to_vmcs12(vmx);
4463                 vmx_disable_shadow_vmcs(vmx);
4464         }
4465         vmx->nested.posted_intr_nv = -1;
4466
4467         /* Flush VMCS12 to guest memory */
4468         kvm_vcpu_write_guest_page(vcpu,
4469                                   vmx->nested.current_vmptr >> PAGE_SHIFT,
4470                                   vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4471
4472         kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4473
4474         vmx->nested.current_vmptr = -1ull;
4475 }
4476
4477 /* Emulate the VMXOFF instruction */
4478 static int handle_vmoff(struct kvm_vcpu *vcpu)
4479 {
4480         if (!nested_vmx_check_permission(vcpu))
4481                 return 1;
4482
4483         free_nested(vcpu);
4484
4485         /* Process a latched INIT during the time the CPU was in VMX operation */
4486         kvm_make_request(KVM_REQ_EVENT, vcpu);
4487
4488         return nested_vmx_succeed(vcpu);
4489 }
4490
4491 /* Emulate the VMCLEAR instruction */
4492 static int handle_vmclear(struct kvm_vcpu *vcpu)
4493 {
4494         struct vcpu_vmx *vmx = to_vmx(vcpu);
4495         u32 zero = 0;
4496         gpa_t vmptr;
4497         u64 evmcs_gpa;
4498
4499         if (!nested_vmx_check_permission(vcpu))
4500                 return 1;
4501
4502         if (nested_vmx_get_vmptr(vcpu, &vmptr))
4503                 return 1;
4504
4505         if (!page_address_valid(vcpu, vmptr))
4506                 return nested_vmx_failValid(vcpu,
4507                         VMXERR_VMCLEAR_INVALID_ADDRESS);
4508
4509         if (vmptr == vmx->nested.vmxon_ptr)
4510                 return nested_vmx_failValid(vcpu,
4511                         VMXERR_VMCLEAR_VMXON_POINTER);
4512
4513         /*
4514          * When Enlightened VMEntry is enabled on the calling CPU we treat the
4515          * memory area pointed to by vmptr as Enlightened VMCS (as there's no good
4516          * way to distinguish it from VMCS12) and we must not corrupt it by
4517          * writing to the non-existent 'launch_state' field. The area doesn't
4518          * have to be the currently active EVMCS on the calling CPU and there's
4519          * nothing KVM has to do to transition it from 'active' to 'non-active'
4520          * state. It is possible that the area will stay mapped as
4521          * vmx->nested.hv_evmcs but this shouldn't be a problem.
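         * (Hence the guard below: the launch_state write, and the release of
         * the cached vmcs12 when vmptr is the current VMCS, are skipped when
         * an Enlightened VMCS is in use.)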
4522 */ 4523 if (likely(!vmx->nested.enlightened_vmcs_enabled || 4524 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 4525 if (vmptr == vmx->nested.current_vmptr) 4526 nested_release_vmcs12(vcpu); 4527 4528 kvm_vcpu_write_guest(vcpu, 4529 vmptr + offsetof(struct vmcs12, 4530 launch_state), 4531 &zero, sizeof(zero)); 4532 } 4533 4534 return nested_vmx_succeed(vcpu); 4535 } 4536 4537 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); 4538 4539 /* Emulate the VMLAUNCH instruction */ 4540 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 4541 { 4542 return nested_vmx_run(vcpu, true); 4543 } 4544 4545 /* Emulate the VMRESUME instruction */ 4546 static int handle_vmresume(struct kvm_vcpu *vcpu) 4547 { 4548 4549 return nested_vmx_run(vcpu, false); 4550 } 4551 4552 static int handle_vmread(struct kvm_vcpu *vcpu) 4553 { 4554 unsigned long field; 4555 u64 field_value; 4556 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4557 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4558 int len; 4559 gva_t gva = 0; 4560 struct vmcs12 *vmcs12; 4561 struct x86_exception e; 4562 short offset; 4563 4564 if (!nested_vmx_check_permission(vcpu)) 4565 return 1; 4566 4567 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) 4568 return nested_vmx_failInvalid(vcpu); 4569 4570 if (!is_guest_mode(vcpu)) 4571 vmcs12 = get_vmcs12(vcpu); 4572 else { 4573 /* 4574 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD 4575 * to shadowed-field sets the ALU flags for VMfailInvalid. 4576 */ 4577 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) 4578 return nested_vmx_failInvalid(vcpu); 4579 vmcs12 = get_shadow_vmcs12(vcpu); 4580 } 4581 4582 /* Decode instruction info and find the field to read */ 4583 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 4584 4585 offset = vmcs_field_to_offset(field); 4586 if (offset < 0) 4587 return nested_vmx_failValid(vcpu, 4588 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4589 4590 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 4591 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4592 4593 /* Read the field, zero-extended to a u64 field_value */ 4594 field_value = vmcs12_read_any(vmcs12, field, offset); 4595 4596 /* 4597 * Now copy part of this value to register or memory, as requested. 4598 * Note that the number of bits actually copied is 32 or 64 depending 4599 * on the guest's mode (32 or 64 bit), not on the given field's length. 4600 */ 4601 if (vmx_instruction_info & (1u << 10)) { 4602 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), 4603 field_value); 4604 } else { 4605 len = is_64_bit_mode(vcpu) ? 
8 : 4; 4606 if (get_vmx_mem_address(vcpu, exit_qualification, 4607 vmx_instruction_info, true, len, &gva)) 4608 return 1; 4609 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 4610 if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e)) 4611 kvm_inject_page_fault(vcpu, &e); 4612 } 4613 4614 return nested_vmx_succeed(vcpu); 4615 } 4616 4617 static bool is_shadow_field_rw(unsigned long field) 4618 { 4619 switch (field) { 4620 #define SHADOW_FIELD_RW(x, y) case x: 4621 #include "vmcs_shadow_fields.h" 4622 return true; 4623 default: 4624 break; 4625 } 4626 return false; 4627 } 4628 4629 static bool is_shadow_field_ro(unsigned long field) 4630 { 4631 switch (field) { 4632 #define SHADOW_FIELD_RO(x, y) case x: 4633 #include "vmcs_shadow_fields.h" 4634 return true; 4635 default: 4636 break; 4637 } 4638 return false; 4639 } 4640 4641 static int handle_vmwrite(struct kvm_vcpu *vcpu) 4642 { 4643 unsigned long field; 4644 int len; 4645 gva_t gva; 4646 struct vcpu_vmx *vmx = to_vmx(vcpu); 4647 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4648 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4649 4650 /* The value to write might be 32 or 64 bits, depending on L1's long 4651 * mode, and eventually we need to write that into a field of several 4652 * possible lengths. The code below first zero-extends the value to 64 4653 * bit (field_value), and then copies only the appropriate number of 4654 * bits into the vmcs12 field. 4655 */ 4656 u64 field_value = 0; 4657 struct x86_exception e; 4658 struct vmcs12 *vmcs12; 4659 short offset; 4660 4661 if (!nested_vmx_check_permission(vcpu)) 4662 return 1; 4663 4664 if (vmx->nested.current_vmptr == -1ull) 4665 return nested_vmx_failInvalid(vcpu); 4666 4667 if (vmx_instruction_info & (1u << 10)) 4668 field_value = kvm_register_readl(vcpu, 4669 (((vmx_instruction_info) >> 3) & 0xf)); 4670 else { 4671 len = is_64_bit_mode(vcpu) ? 8 : 4; 4672 if (get_vmx_mem_address(vcpu, exit_qualification, 4673 vmx_instruction_info, false, len, &gva)) 4674 return 1; 4675 if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) { 4676 kvm_inject_page_fault(vcpu, &e); 4677 return 1; 4678 } 4679 } 4680 4681 4682 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 4683 /* 4684 * If the vCPU supports "VMWRITE to any supported field in the 4685 * VMCS," then the "read-only" fields are actually read/write. 4686 */ 4687 if (vmcs_field_readonly(field) && 4688 !nested_cpu_has_vmwrite_any_field(vcpu)) 4689 return nested_vmx_failValid(vcpu, 4690 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 4691 4692 if (!is_guest_mode(vcpu)) { 4693 vmcs12 = get_vmcs12(vcpu); 4694 4695 /* 4696 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 4697 * vmcs12, else we may crush a field or consume a stale value. 4698 */ 4699 if (!is_shadow_field_rw(field)) 4700 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4701 } else { 4702 /* 4703 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE 4704 * to shadowed-field sets the ALU flags for VMfailInvalid. 4705 */ 4706 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) 4707 return nested_vmx_failInvalid(vcpu); 4708 vmcs12 = get_shadow_vmcs12(vcpu); 4709 } 4710 4711 offset = vmcs_field_to_offset(field); 4712 if (offset < 0) 4713 return nested_vmx_failValid(vcpu, 4714 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4715 4716 /* 4717 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 4718 * fields on VMWRITE. 
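         * (In the VMCS access-rights layout, the 0x1f0ff mask used below keeps
         * the type/S/DPL/P bits (7:0) and the AVL/L/D.B/G/unusable bits
         * (16:12) while clearing the reserved bits 11:8.)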
Emulate this behavior to ensure consistent KVM 4719 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 4720 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 4721 * from L1 will return a different value than VMREAD from L2 (L1 sees 4722 * the stripped down value, L2 sees the full value as stored by KVM). 4723 */ 4724 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 4725 field_value &= 0x1f0ff; 4726 4727 vmcs12_write_any(vmcs12, field, offset, field_value); 4728 4729 /* 4730 * Do not track vmcs12 dirty-state if in guest-mode as we actually 4731 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 4732 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 4733 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 4734 */ 4735 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 4736 /* 4737 * L1 can read these fields without exiting, ensure the 4738 * shadow VMCS is up-to-date. 4739 */ 4740 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 4741 preempt_disable(); 4742 vmcs_load(vmx->vmcs01.shadow_vmcs); 4743 4744 __vmcs_writel(field, field_value); 4745 4746 vmcs_clear(vmx->vmcs01.shadow_vmcs); 4747 vmcs_load(vmx->loaded_vmcs->vmcs); 4748 preempt_enable(); 4749 } 4750 vmx->nested.dirty_vmcs12 = true; 4751 } 4752 4753 return nested_vmx_succeed(vcpu); 4754 } 4755 4756 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 4757 { 4758 vmx->nested.current_vmptr = vmptr; 4759 if (enable_shadow_vmcs) { 4760 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 4761 vmcs_write64(VMCS_LINK_POINTER, 4762 __pa(vmx->vmcs01.shadow_vmcs)); 4763 vmx->nested.need_vmcs12_to_shadow_sync = true; 4764 } 4765 vmx->nested.dirty_vmcs12 = true; 4766 } 4767 4768 /* Emulate the VMPTRLD instruction */ 4769 static int handle_vmptrld(struct kvm_vcpu *vcpu) 4770 { 4771 struct vcpu_vmx *vmx = to_vmx(vcpu); 4772 gpa_t vmptr; 4773 4774 if (!nested_vmx_check_permission(vcpu)) 4775 return 1; 4776 4777 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4778 return 1; 4779 4780 if (!page_address_valid(vcpu, vmptr)) 4781 return nested_vmx_failValid(vcpu, 4782 VMXERR_VMPTRLD_INVALID_ADDRESS); 4783 4784 if (vmptr == vmx->nested.vmxon_ptr) 4785 return nested_vmx_failValid(vcpu, 4786 VMXERR_VMPTRLD_VMXON_POINTER); 4787 4788 /* Forbid normal VMPTRLD if Enlightened version was used */ 4789 if (vmx->nested.hv_evmcs) 4790 return 1; 4791 4792 if (vmx->nested.current_vmptr != vmptr) { 4793 struct kvm_host_map map; 4794 struct vmcs12 *new_vmcs12; 4795 4796 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { 4797 /* 4798 * Reads from an unbacked page return all 1s, 4799 * which means that the 32 bits located at the 4800 * given physical address won't match the required 4801 * VMCS12_REVISION identifier. 4802 */ 4803 return nested_vmx_failValid(vcpu, 4804 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 4805 } 4806 4807 new_vmcs12 = map.hva; 4808 4809 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || 4810 (new_vmcs12->hdr.shadow_vmcs && 4811 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 4812 kvm_vcpu_unmap(vcpu, &map, false); 4813 return nested_vmx_failValid(vcpu, 4814 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 4815 } 4816 4817 nested_release_vmcs12(vcpu); 4818 4819 /* 4820 * Load VMCS12 from guest memory since it is not already 4821 * cached. 
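         * (From this point on KVM reads and writes the cached copy; it is
         * flushed back to guest memory when the vmcs12 is released, see
         * nested_release_vmcs12().)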
4822                  */
4823                 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
4824                 kvm_vcpu_unmap(vcpu, &map, false);
4825
4826                 set_current_vmptr(vmx, vmptr);
4827         }
4828
4829         return nested_vmx_succeed(vcpu);
4830 }
4831
4832 /* Emulate the VMPTRST instruction */
4833 static int handle_vmptrst(struct kvm_vcpu *vcpu)
4834 {
4835         unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
4836         u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4837         gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
4838         struct x86_exception e;
4839         gva_t gva;
4840
4841         if (!nested_vmx_check_permission(vcpu))
4842                 return 1;
4843
4844         if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
4845                 return 1;
4846
4847         if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
4848                                 true, sizeof(gpa_t), &gva))
4849                 return 1;
4850         /* *_system ok, nested_vmx_check_permission has verified cpl=0 */
4851         if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
4852                                         sizeof(gpa_t), &e)) {
4853                 kvm_inject_page_fault(vcpu, &e);
4854                 return 1;
4855         }
4856         return nested_vmx_succeed(vcpu);
4857 }
4858
4859 /* Emulate the INVEPT instruction */
4860 static int handle_invept(struct kvm_vcpu *vcpu)
4861 {
4862         struct vcpu_vmx *vmx = to_vmx(vcpu);
4863         u32 vmx_instruction_info, types;
4864         unsigned long type;
4865         gva_t gva;
4866         struct x86_exception e;
4867         struct {
4868                 u64 eptp, gpa;
4869         } operand;
4870
4871         if (!(vmx->nested.msrs.secondary_ctls_high &
4872               SECONDARY_EXEC_ENABLE_EPT) ||
4873             !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
4874                 kvm_queue_exception(vcpu, UD_VECTOR);
4875                 return 1;
4876         }
4877
4878         if (!nested_vmx_check_permission(vcpu))
4879                 return 1;
4880
4881         vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4882         type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
4883
4884         types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
4885
4886         if (type >= 32 || !(types & (1 << type)))
4887                 return nested_vmx_failValid(vcpu,
4888                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4889
4890         /* According to the Intel VMX instruction reference, the memory
4891          * operand is read even if it isn't needed (e.g., for type==global)
4892          */
4893         if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4894                         vmx_instruction_info, false, sizeof(operand), &gva))
4895                 return 1;
4896         if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
4897                 kvm_inject_page_fault(vcpu, &e);
4898                 return 1;
4899         }
4900
4901         switch (type) {
4902         case VMX_EPT_EXTENT_GLOBAL:
4903         case VMX_EPT_EXTENT_CONTEXT:
4904                 /*
4905                  * TODO: Sync the necessary shadow EPT roots here, rather than
4906                  * at the next emulated VM-entry.
4907 */ 4908 break; 4909 default: 4910 BUG_ON(1); 4911 break; 4912 } 4913 4914 return nested_vmx_succeed(vcpu); 4915 } 4916 4917 static int handle_invvpid(struct kvm_vcpu *vcpu) 4918 { 4919 struct vcpu_vmx *vmx = to_vmx(vcpu); 4920 u32 vmx_instruction_info; 4921 unsigned long type, types; 4922 gva_t gva; 4923 struct x86_exception e; 4924 struct { 4925 u64 vpid; 4926 u64 gla; 4927 } operand; 4928 u16 vpid02; 4929 4930 if (!(vmx->nested.msrs.secondary_ctls_high & 4931 SECONDARY_EXEC_ENABLE_VPID) || 4932 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 4933 kvm_queue_exception(vcpu, UD_VECTOR); 4934 return 1; 4935 } 4936 4937 if (!nested_vmx_check_permission(vcpu)) 4938 return 1; 4939 4940 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4941 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 4942 4943 types = (vmx->nested.msrs.vpid_caps & 4944 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 4945 4946 if (type >= 32 || !(types & (1 << type))) 4947 return nested_vmx_failValid(vcpu, 4948 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 4949 4950 /* according to the intel vmx instruction reference, the memory 4951 * operand is read even if it isn't needed (e.g., for type==global) 4952 */ 4953 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 4954 vmx_instruction_info, false, sizeof(operand), &gva)) 4955 return 1; 4956 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { 4957 kvm_inject_page_fault(vcpu, &e); 4958 return 1; 4959 } 4960 if (operand.vpid >> 16) 4961 return nested_vmx_failValid(vcpu, 4962 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 4963 4964 vpid02 = nested_get_vpid02(vcpu); 4965 switch (type) { 4966 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 4967 if (!operand.vpid || 4968 is_noncanonical_address(operand.gla, vcpu)) 4969 return nested_vmx_failValid(vcpu, 4970 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 4971 if (cpu_has_vmx_invvpid_individual_addr()) { 4972 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, 4973 vpid02, operand.gla); 4974 } else 4975 __vmx_flush_tlb(vcpu, vpid02, false); 4976 break; 4977 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 4978 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 4979 if (!operand.vpid) 4980 return nested_vmx_failValid(vcpu, 4981 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 4982 __vmx_flush_tlb(vcpu, vpid02, false); 4983 break; 4984 case VMX_VPID_EXTENT_ALL_CONTEXT: 4985 __vmx_flush_tlb(vcpu, vpid02, false); 4986 break; 4987 default: 4988 WARN_ON_ONCE(1); 4989 return kvm_skip_emulated_instruction(vcpu); 4990 } 4991 4992 return nested_vmx_succeed(vcpu); 4993 } 4994 4995 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 4996 struct vmcs12 *vmcs12) 4997 { 4998 u32 index = kvm_rcx_read(vcpu); 4999 u64 address; 5000 bool accessed_dirty; 5001 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 5002 5003 if (!nested_cpu_has_eptp_switching(vmcs12) || 5004 !nested_cpu_has_ept(vmcs12)) 5005 return 1; 5006 5007 if (index >= VMFUNC_EPTP_ENTRIES) 5008 return 1; 5009 5010 5011 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5012 &address, index * 8, 8)) 5013 return 1; 5014 5015 accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); 5016 5017 /* 5018 * If the (L2) guest does a vmfunc to the currently 5019 * active ept pointer, we don't have to do anything else 5020 */ 5021 if (vmcs12->ept_pointer != address) { 5022 if (!valid_ept_address(vcpu, address)) 5023 return 1; 5024 5025 kvm_mmu_unload(vcpu); 5026 mmu->ept_ad = accessed_dirty; 5027 mmu->mmu_role.base.ad_disabled = !accessed_dirty; 5028 vmcs12->ept_pointer = 
address;
5029                 /*
5030                  * TODO: Check what's the correct approach in case
5031                  * mmu reload fails. Currently, we just let the next
5032                  * reload potentially fail
5033                  */
5034                 kvm_mmu_reload(vcpu);
5035         }
5036
5037         return 0;
5038 }
5039
5040 static int handle_vmfunc(struct kvm_vcpu *vcpu)
5041 {
5042         struct vcpu_vmx *vmx = to_vmx(vcpu);
5043         struct vmcs12 *vmcs12;
5044         u32 function = kvm_rax_read(vcpu);
5045
5046         /*
5047          * VMFUNC is only supported for nested guests, but we always enable the
5048          * secondary control for simplicity; for non-nested mode, fake that we
5049          * didn't by injecting #UD.
5050          */
5051         if (!is_guest_mode(vcpu)) {
5052                 kvm_queue_exception(vcpu, UD_VECTOR);
5053                 return 1;
5054         }
5055
5056         vmcs12 = get_vmcs12(vcpu);
5057         if ((vmcs12->vm_function_control & (1 << function)) == 0)
5058                 goto fail;
5059
5060         switch (function) {
5061         case 0:
5062                 if (nested_vmx_eptp_switching(vcpu, vmcs12))
5063                         goto fail;
5064                 break;
5065         default:
5066                 goto fail;
5067         }
5068         return kvm_skip_emulated_instruction(vcpu);
5069
5070 fail:
5071         nested_vmx_vmexit(vcpu, vmx->exit_reason,
5072                           vmcs_read32(VM_EXIT_INTR_INFO),
5073                           vmcs_readl(EXIT_QUALIFICATION));
5074         return 1;
5075 }
5076
5077
5078 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5079                                        struct vmcs12 *vmcs12)
5080 {
5081         unsigned long exit_qualification;
5082         gpa_t bitmap, last_bitmap;
5083         unsigned int port;
5084         int size;
5085         u8 b;
5086
5087         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5088                 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5089
5090         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5091
5092         port = exit_qualification >> 16;
5093         size = (exit_qualification & 7) + 1;
5094
5095         last_bitmap = (gpa_t)-1;
5096         b = -1;
5097
5098         while (size > 0) {
5099                 if (port < 0x8000)
5100                         bitmap = vmcs12->io_bitmap_a;
5101                 else if (port < 0x10000)
5102                         bitmap = vmcs12->io_bitmap_b;
5103                 else
5104                         return true;
5105                 bitmap += (port & 0x7fff) / 8;
5106
5107                 if (last_bitmap != bitmap)
5108                         if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5109                                 return true;
5110                 if (b & (1 << (port & 7)))
5111                         return true;
5112
5113                 port++;
5114                 size--;
5115                 last_bitmap = bitmap;
5116         }
5117
5118         return false;
5119 }
5120
5121 /*
5122  * Return 1 if we should exit from L2 to L1 to handle an MSR access,
5123  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5124  * disinterest in the current event (read or write a specific MSR) by using an
5125  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5126  */
5127 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5128                                         struct vmcs12 *vmcs12, u32 exit_reason)
5129 {
5130         u32 msr_index = kvm_rcx_read(vcpu);
5131         gpa_t bitmap;
5132
5133         if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5134                 return true;
5135
5136         /*
5137          * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5138          * for the four combinations of read/write and low/high MSR numbers.
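         * (As an illustration of the layout: a WRMSR to MSR 0xc0000080 ends up
         * at bit 0 of byte 2048 + 1024 + 0x80/8 = 3088 of the page.)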
5139 * First we need to figure out which of the four to use: 5140 */ 5141 bitmap = vmcs12->msr_bitmap; 5142 if (exit_reason == EXIT_REASON_MSR_WRITE) 5143 bitmap += 2048; 5144 if (msr_index >= 0xc0000000) { 5145 msr_index -= 0xc0000000; 5146 bitmap += 1024; 5147 } 5148 5149 /* Then read the msr_index'th bit from this bitmap: */ 5150 if (msr_index < 1024*8) { 5151 unsigned char b; 5152 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 5153 return true; 5154 return 1 & (b >> (msr_index & 7)); 5155 } else 5156 return true; /* let L1 handle the wrong parameter */ 5157 } 5158 5159 /* 5160 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 5161 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 5162 * intercept (via guest_host_mask etc.) the current event. 5163 */ 5164 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 5165 struct vmcs12 *vmcs12) 5166 { 5167 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5168 int cr = exit_qualification & 15; 5169 int reg; 5170 unsigned long val; 5171 5172 switch ((exit_qualification >> 4) & 3) { 5173 case 0: /* mov to cr */ 5174 reg = (exit_qualification >> 8) & 15; 5175 val = kvm_register_readl(vcpu, reg); 5176 switch (cr) { 5177 case 0: 5178 if (vmcs12->cr0_guest_host_mask & 5179 (val ^ vmcs12->cr0_read_shadow)) 5180 return true; 5181 break; 5182 case 3: 5183 if ((vmcs12->cr3_target_count >= 1 && 5184 vmcs12->cr3_target_value0 == val) || 5185 (vmcs12->cr3_target_count >= 2 && 5186 vmcs12->cr3_target_value1 == val) || 5187 (vmcs12->cr3_target_count >= 3 && 5188 vmcs12->cr3_target_value2 == val) || 5189 (vmcs12->cr3_target_count >= 4 && 5190 vmcs12->cr3_target_value3 == val)) 5191 return false; 5192 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 5193 return true; 5194 break; 5195 case 4: 5196 if (vmcs12->cr4_guest_host_mask & 5197 (vmcs12->cr4_read_shadow ^ val)) 5198 return true; 5199 break; 5200 case 8: 5201 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 5202 return true; 5203 break; 5204 } 5205 break; 5206 case 2: /* clts */ 5207 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 5208 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 5209 return true; 5210 break; 5211 case 1: /* mov from cr */ 5212 switch (cr) { 5213 case 3: 5214 if (vmcs12->cpu_based_vm_exec_control & 5215 CPU_BASED_CR3_STORE_EXITING) 5216 return true; 5217 break; 5218 case 8: 5219 if (vmcs12->cpu_based_vm_exec_control & 5220 CPU_BASED_CR8_STORE_EXITING) 5221 return true; 5222 break; 5223 } 5224 break; 5225 case 3: /* lmsw */ 5226 /* 5227 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 5228 * cr0. Other attempted changes are ignored, with no exit. 
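 * For example, if L1 owns CR0.TS (bit 3 of cr0_guest_host_mask is set) and
 * the lmsw source operand flips TS relative to cr0_read_shadow, the exit is
 * reflected to L1; likewise a PE 0->1 transition when L1 owns CR0.PE.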
5229 */ 5230 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5231 if (vmcs12->cr0_guest_host_mask & 0xe & 5232 (val ^ vmcs12->cr0_read_shadow)) 5233 return true; 5234 if ((vmcs12->cr0_guest_host_mask & 0x1) && 5235 !(vmcs12->cr0_read_shadow & 0x1) && 5236 (val & 0x1)) 5237 return true; 5238 break; 5239 } 5240 return false; 5241 } 5242 5243 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 5244 struct vmcs12 *vmcs12, gpa_t bitmap) 5245 { 5246 u32 vmx_instruction_info; 5247 unsigned long field; 5248 u8 b; 5249 5250 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 5251 return true; 5252 5253 /* Decode instruction info and find the field to access */ 5254 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5255 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 5256 5257 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 5258 if (field >> 15) 5259 return true; 5260 5261 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 5262 return true; 5263 5264 return 1 & (b >> (field & 7)); 5265 } 5266 5267 /* 5268 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we 5269 * should handle it ourselves in L0 (and then continue L2). Only call this 5270 * when in is_guest_mode (L2). 5271 */ 5272 bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) 5273 { 5274 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 5275 struct vcpu_vmx *vmx = to_vmx(vcpu); 5276 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5277 5278 if (vmx->nested.nested_run_pending) 5279 return false; 5280 5281 if (unlikely(vmx->fail)) { 5282 trace_kvm_nested_vmenter_failed( 5283 "hardware VM-instruction error: ", 5284 vmcs_read32(VM_INSTRUCTION_ERROR)); 5285 return true; 5286 } 5287 5288 /* 5289 * The host physical addresses of some pages of guest memory 5290 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC 5291 * Page). The CPU may write to these pages via their host 5292 * physical address while L2 is running, bypassing any 5293 * address-translation-based dirty tracking (e.g. EPT write 5294 * protection). 5295 * 5296 * Mark them dirty on every exit from L2 to prevent them from 5297 * getting out of sync with dirty tracking. 
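 * (Dirty logging, e.g. for live migration, relies on the log being
 * complete, so pages the CPU may have written via their host physical
 * address must still be flagged here.)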
5298 */ 5299 nested_mark_vmcs12_pages_dirty(vcpu); 5300 5301 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, 5302 vmcs_readl(EXIT_QUALIFICATION), 5303 vmx->idt_vectoring_info, 5304 intr_info, 5305 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 5306 KVM_ISA_VMX); 5307 5308 switch (exit_reason) { 5309 case EXIT_REASON_EXCEPTION_NMI: 5310 if (is_nmi(intr_info)) 5311 return false; 5312 else if (is_page_fault(intr_info)) 5313 return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; 5314 else if (is_debug(intr_info) && 5315 vcpu->guest_debug & 5316 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5317 return false; 5318 else if (is_breakpoint(intr_info) && 5319 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5320 return false; 5321 return vmcs12->exception_bitmap & 5322 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 5323 case EXIT_REASON_EXTERNAL_INTERRUPT: 5324 return false; 5325 case EXIT_REASON_TRIPLE_FAULT: 5326 return true; 5327 case EXIT_REASON_PENDING_INTERRUPT: 5328 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); 5329 case EXIT_REASON_NMI_WINDOW: 5330 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); 5331 case EXIT_REASON_TASK_SWITCH: 5332 return true; 5333 case EXIT_REASON_CPUID: 5334 return true; 5335 case EXIT_REASON_HLT: 5336 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 5337 case EXIT_REASON_INVD: 5338 return true; 5339 case EXIT_REASON_INVLPG: 5340 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5341 case EXIT_REASON_RDPMC: 5342 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 5343 case EXIT_REASON_RDRAND: 5344 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 5345 case EXIT_REASON_RDSEED: 5346 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 5347 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 5348 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 5349 case EXIT_REASON_VMREAD: 5350 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5351 vmcs12->vmread_bitmap); 5352 case EXIT_REASON_VMWRITE: 5353 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5354 vmcs12->vmwrite_bitmap); 5355 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 5356 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 5357 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 5358 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 5359 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 5360 /* 5361 * VMX instructions trap unconditionally. This allows L1 to 5362 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
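 * They are reflected to L1 unconditionally, without consulting any
 * vmcs12 control bit.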
5363 */ 5364 return true; 5365 case EXIT_REASON_CR_ACCESS: 5366 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 5367 case EXIT_REASON_DR_ACCESS: 5368 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 5369 case EXIT_REASON_IO_INSTRUCTION: 5370 return nested_vmx_exit_handled_io(vcpu, vmcs12); 5371 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 5372 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 5373 case EXIT_REASON_MSR_READ: 5374 case EXIT_REASON_MSR_WRITE: 5375 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 5376 case EXIT_REASON_INVALID_STATE: 5377 return true; 5378 case EXIT_REASON_MWAIT_INSTRUCTION: 5379 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 5380 case EXIT_REASON_MONITOR_TRAP_FLAG: 5381 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); 5382 case EXIT_REASON_MONITOR_INSTRUCTION: 5383 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 5384 case EXIT_REASON_PAUSE_INSTRUCTION: 5385 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 5386 nested_cpu_has2(vmcs12, 5387 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 5388 case EXIT_REASON_MCE_DURING_VMENTRY: 5389 return false; 5390 case EXIT_REASON_TPR_BELOW_THRESHOLD: 5391 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 5392 case EXIT_REASON_APIC_ACCESS: 5393 case EXIT_REASON_APIC_WRITE: 5394 case EXIT_REASON_EOI_INDUCED: 5395 /* 5396 * The controls for "virtualize APIC accesses," "APIC- 5397 * register virtualization," and "virtual-interrupt 5398 * delivery" only come from vmcs12. 5399 */ 5400 return true; 5401 case EXIT_REASON_EPT_VIOLATION: 5402 /* 5403 * L0 always deals with the EPT violation. If nested EPT is 5404 * used, and the nested mmu code discovers that the address is 5405 * missing in the guest EPT table (EPT12), the EPT violation 5406 * will be injected with nested_ept_inject_page_fault() 5407 */ 5408 return false; 5409 case EXIT_REASON_EPT_MISCONFIG: 5410 /* 5411 * L2 never directly uses L1's EPT, but rather L0's own EPT 5412 * table (shadow on EPT) or a merged EPT table that L0 built 5413 * (EPT on EPT). So any problems with the structure of the 5414 * table are L0's fault. 5415 */ 5416 return false; 5417 case EXIT_REASON_INVPCID: 5418 return 5419 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 5420 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5421 case EXIT_REASON_WBINVD: 5422 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 5423 case EXIT_REASON_XSETBV: 5424 return true; 5425 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 5426 /* 5427 * This should never happen, since it is not possible to 5428 * set XSS to a non-zero value---neither in L1 nor in L2. 5429 * If it were, XSS would have to be checked against 5430 * the XSS exit bitmap in vmcs12. 5431 */ 5432 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 5433 case EXIT_REASON_PREEMPTION_TIMER: 5434 return false; 5435 case EXIT_REASON_PML_FULL: 5436 /* We emulate PML support to L1. */ 5437 return false; 5438 case EXIT_REASON_VMFUNC: 5439 /* VM functions are emulated through L2->L0 vmexits.
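 * handle_vmfunc() above either performs the requested EPTP switch itself
 * or synthesizes a VMFUNC exit to L1, so there is nothing to reflect here.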
*/ 5440 return false; 5441 case EXIT_REASON_ENCLS: 5442 /* SGX is never exposed to L1 */ 5443 return false; 5444 default: 5445 return true; 5446 } 5447 } 5448 5449 5450 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 5451 struct kvm_nested_state __user *user_kvm_nested_state, 5452 u32 user_data_size) 5453 { 5454 struct vcpu_vmx *vmx; 5455 struct vmcs12 *vmcs12; 5456 struct kvm_nested_state kvm_state = { 5457 .flags = 0, 5458 .format = KVM_STATE_NESTED_FORMAT_VMX, 5459 .size = sizeof(kvm_state), 5460 .hdr.vmx.vmxon_pa = -1ull, 5461 .hdr.vmx.vmcs12_pa = -1ull, 5462 }; 5463 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 5464 &user_kvm_nested_state->data.vmx[0]; 5465 5466 if (!vcpu) 5467 return kvm_state.size + sizeof(*user_vmx_nested_state); 5468 5469 vmx = to_vmx(vcpu); 5470 vmcs12 = get_vmcs12(vcpu); 5471 5472 if (nested_vmx_allowed(vcpu) && 5473 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 5474 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 5475 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 5476 5477 if (vmx_has_valid_vmcs12(vcpu)) { 5478 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 5479 5480 if (vmx->nested.hv_evmcs) 5481 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 5482 5483 if (is_guest_mode(vcpu) && 5484 nested_cpu_has_shadow_vmcs(vmcs12) && 5485 vmcs12->vmcs_link_pointer != -1ull) 5486 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 5487 } 5488 5489 if (vmx->nested.smm.vmxon) 5490 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 5491 5492 if (vmx->nested.smm.guest_mode) 5493 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 5494 5495 if (is_guest_mode(vcpu)) { 5496 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 5497 5498 if (vmx->nested.nested_run_pending) 5499 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 5500 } 5501 } 5502 5503 if (user_data_size < kvm_state.size) 5504 goto out; 5505 5506 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 5507 return -EFAULT; 5508 5509 if (!vmx_has_valid_vmcs12(vcpu)) 5510 goto out; 5511 5512 /* 5513 * When running L2, the authoritative vmcs12 state is in the 5514 * vmcs02. When running L1, the authoritative vmcs12 state is 5515 * in the shadow or enlightened vmcs linked to vmcs01, unless 5516 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 5517 * vmcs12 state is in the vmcs12 already. 5518 */ 5519 if (is_guest_mode(vcpu)) { 5520 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 5521 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5522 } else if (!vmx->nested.need_vmcs12_to_shadow_sync) { 5523 if (vmx->nested.hv_evmcs) 5524 copy_enlightened_to_vmcs12(vmx); 5525 else if (enable_shadow_vmcs) 5526 copy_shadow_to_vmcs12(vmx); 5527 } 5528 5529 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 5530 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 5531 5532 /* 5533 * Copy over the full allocated size of vmcs12 rather than just the size 5534 * of the struct. 5535 */ 5536 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 5537 return -EFAULT; 5538 5539 if (nested_cpu_has_shadow_vmcs(vmcs12) && 5540 vmcs12->vmcs_link_pointer != -1ull) { 5541 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 5542 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 5543 return -EFAULT; 5544 } 5545 5546 out: 5547 return kvm_state.size; 5548 } 5549 5550 /* 5551 * Forcibly leave nested mode in order to be able to reset the VCPU later on. 
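 * Also used by vmx_set_nested_state() below to drop any stale nested
 * state before a new snapshot is loaded.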
5552 */ 5553 void vmx_leave_nested(struct kvm_vcpu *vcpu) 5554 { 5555 if (is_guest_mode(vcpu)) { 5556 to_vmx(vcpu)->nested.nested_run_pending = 0; 5557 nested_vmx_vmexit(vcpu, -1, 0, 0); 5558 } 5559 free_nested(vcpu); 5560 } 5561 5562 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 5563 struct kvm_nested_state __user *user_kvm_nested_state, 5564 struct kvm_nested_state *kvm_state) 5565 { 5566 struct vcpu_vmx *vmx = to_vmx(vcpu); 5567 struct vmcs12 *vmcs12; 5568 u32 exit_qual; 5569 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 5570 &user_kvm_nested_state->data.vmx[0]; 5571 int ret; 5572 5573 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 5574 return -EINVAL; 5575 5576 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) { 5577 if (kvm_state->hdr.vmx.smm.flags) 5578 return -EINVAL; 5579 5580 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) 5581 return -EINVAL; 5582 5583 /* 5584 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 5585 * enable the eVMCS capability on the vCPU. However, the code 5586 * was since changed such that the flag signals that vmcs12 5587 * should be copied into the eVMCS in guest memory. 5588 * 5589 * To preserve backwards compatibility, allow userspace 5590 * to set this flag even when there is no VMXON region. 5591 */ 5592 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 5593 return -EINVAL; 5594 } else { 5595 if (!nested_vmx_allowed(vcpu)) 5596 return -EINVAL; 5597 5598 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 5599 return -EINVAL; 5600 } 5601 5602 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 5603 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 5604 return -EINVAL; 5605 5606 if (kvm_state->hdr.vmx.smm.flags & 5607 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 5608 return -EINVAL; 5609 5610 /* 5611 * SMM temporarily disables VMX, so we cannot be in guest mode, 5612 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 5613 * must be zero. 5614 */ 5615 if (is_smm(vcpu) ? 5616 (kvm_state->flags & 5617 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 5618 : kvm_state->hdr.vmx.smm.flags) 5619 return -EINVAL; 5620 5621 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 5622 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 5623 return -EINVAL; 5624 5625 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 5626 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) 5627 return -EINVAL; 5628 5629 vmx_leave_nested(vcpu); 5630 5631 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) 5632 return 0; 5633 5634 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 5635 ret = enter_vmx_operation(vcpu); 5636 if (ret) 5637 return ret; 5638 5639 /* Empty 'VMXON' state is permitted */ 5640 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) 5641 return 0; 5642 5643 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { 5644 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 5645 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 5646 return -EINVAL; 5647 5648 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 5649 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 5650 /* 5651 * Sync eVMCS upon entry as we may not have 5652 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
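 * (The copy into the enlightened VMCS is then performed on the next
 * vmentry, once the assist page is available.)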
5653 */ 5654 vmx->nested.need_vmcs12_to_shadow_sync = true; 5655 } else { 5656 return -EINVAL; 5657 } 5658 5659 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 5660 vmx->nested.smm.vmxon = true; 5661 vmx->nested.vmxon = false; 5662 5663 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 5664 vmx->nested.smm.guest_mode = true; 5665 } 5666 5667 vmcs12 = get_vmcs12(vcpu); 5668 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 5669 return -EFAULT; 5670 5671 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 5672 return -EINVAL; 5673 5674 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 5675 return 0; 5676 5677 vmx->nested.nested_run_pending = 5678 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 5679 5680 ret = -EINVAL; 5681 if (nested_cpu_has_shadow_vmcs(vmcs12) && 5682 vmcs12->vmcs_link_pointer != -1ull) { 5683 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 5684 5685 if (kvm_state->size < 5686 sizeof(*kvm_state) + 5687 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 5688 goto error_guest_mode; 5689 5690 if (copy_from_user(shadow_vmcs12, 5691 user_vmx_nested_state->shadow_vmcs12, 5692 sizeof(*shadow_vmcs12))) { 5693 ret = -EFAULT; 5694 goto error_guest_mode; 5695 } 5696 5697 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 5698 !shadow_vmcs12->hdr.shadow_vmcs) 5699 goto error_guest_mode; 5700 } 5701 5702 if (nested_vmx_check_controls(vcpu, vmcs12) || 5703 nested_vmx_check_host_state(vcpu, vmcs12) || 5704 nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) 5705 goto error_guest_mode; 5706 5707 vmx->nested.dirty_vmcs12 = true; 5708 ret = nested_vmx_enter_non_root_mode(vcpu, false); 5709 if (ret) 5710 goto error_guest_mode; 5711 5712 return 0; 5713 5714 error_guest_mode: 5715 vmx->nested.nested_run_pending = 0; 5716 return ret; 5717 } 5718 5719 void nested_vmx_vcpu_setup(void) 5720 { 5721 if (enable_shadow_vmcs) { 5722 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 5723 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 5724 } 5725 } 5726 5727 /* 5728 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 5729 * returned for the various VMX controls MSRs when nested VMX is enabled. 5730 * The same values should also be used to verify that vmcs12 control fields are 5731 * valid during nested entry from L1 to L2. 5732 * Each of these control msrs has a low and high 32-bit half: A low bit is on 5733 * if the corresponding bit in the (32-bit) control field *must* be on, and a 5734 * bit in the high half is on if the corresponding bit in the control field 5735 * may be on. See also vmx_control_verify(). 5736 */ 5737 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, 5738 bool apicv) 5739 { 5740 /* 5741 * Note that as a general rule, the high half of the MSRs (bits in 5742 * the control fields which may be 1) should be initialized by the 5743 * intersection of the underlying hardware's MSR (i.e., features which 5744 * can be supported) and the list of features we want to expose - 5745 * because they are known to be properly supported in our code. 5746 * Also, usually, the low half of the MSRs (bits which must be 1) can 5747 * be set to 0, meaning that L1 may turn off any of these bits. 
The 5748 * reason is that if one of these bits is necessary, it will appear 5749 * in vmcs01, and prepare_vmcs02, which bitwise-or's the control 5750 * fields of vmcs01 and vmcs12, will keep these bits set in vmcs02 - and 5751 * nested_vmx_exit_reflected() will not pass the related exits to L1. 5752 * These rules have exceptions below. 5753 */ 5754 5755 /* pin-based controls */ 5756 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 5757 msrs->pinbased_ctls_low, 5758 msrs->pinbased_ctls_high); 5759 msrs->pinbased_ctls_low |= 5760 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 5761 msrs->pinbased_ctls_high &= 5762 PIN_BASED_EXT_INTR_MASK | 5763 PIN_BASED_NMI_EXITING | 5764 PIN_BASED_VIRTUAL_NMIS | 5765 (apicv ? PIN_BASED_POSTED_INTR : 0); 5766 msrs->pinbased_ctls_high |= 5767 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 5768 PIN_BASED_VMX_PREEMPTION_TIMER; 5769 5770 /* exit controls */ 5771 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 5772 msrs->exit_ctls_low, 5773 msrs->exit_ctls_high); 5774 msrs->exit_ctls_low = 5775 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 5776 5777 msrs->exit_ctls_high &= 5778 #ifdef CONFIG_X86_64 5779 VM_EXIT_HOST_ADDR_SPACE_SIZE | 5780 #endif 5781 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 5782 msrs->exit_ctls_high |= 5783 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 5784 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 5785 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 5786 5787 /* We support free control of debug control saving. */ 5788 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 5789 5790 /* entry controls */ 5791 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 5792 msrs->entry_ctls_low, 5793 msrs->entry_ctls_high); 5794 msrs->entry_ctls_low = 5795 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 5796 msrs->entry_ctls_high &= 5797 #ifdef CONFIG_X86_64 5798 VM_ENTRY_IA32E_MODE | 5799 #endif 5800 VM_ENTRY_LOAD_IA32_PAT; 5801 msrs->entry_ctls_high |= 5802 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 5803 5804 /* We support free control of debug control loading. */ 5805 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 5806 5807 /* cpu-based controls */ 5808 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 5809 msrs->procbased_ctls_low, 5810 msrs->procbased_ctls_high); 5811 msrs->procbased_ctls_low = 5812 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 5813 msrs->procbased_ctls_high &= 5814 CPU_BASED_VIRTUAL_INTR_PENDING | 5815 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | 5816 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 5817 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 5818 CPU_BASED_CR3_STORE_EXITING | 5819 #ifdef CONFIG_X86_64 5820 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 5821 #endif 5822 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 5823 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 5824 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 5825 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 5826 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 5827 /* 5828 * We can allow some features even when not supported by the 5829 * hardware. For example, L1 can specify an MSR bitmap - and we 5830 * can use it to avoid exits to L1 - even when L0 runs L2 5831 * without MSR bitmaps. 5832 */ 5833 msrs->procbased_ctls_high |= 5834 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 5835 CPU_BASED_USE_MSR_BITMAPS; 5836 5837 /* We support free control of CR3 access interception. */ 5838 msrs->procbased_ctls_low &= 5839 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 5840 5841 /* 5842 * secondary cpu-based controls.
Do not include those that 5843 * depend on CPUID bits, they are added later by vmx_cpuid_update. 5844 */ 5845 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) 5846 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 5847 msrs->secondary_ctls_low, 5848 msrs->secondary_ctls_high); 5849 5850 msrs->secondary_ctls_low = 0; 5851 msrs->secondary_ctls_high &= 5852 SECONDARY_EXEC_DESC | 5853 SECONDARY_EXEC_RDTSCP | 5854 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 5855 SECONDARY_EXEC_WBINVD_EXITING | 5856 SECONDARY_EXEC_APIC_REGISTER_VIRT | 5857 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 5858 SECONDARY_EXEC_RDRAND_EXITING | 5859 SECONDARY_EXEC_ENABLE_INVPCID | 5860 SECONDARY_EXEC_RDSEED_EXITING | 5861 SECONDARY_EXEC_XSAVES; 5862 5863 /* 5864 * We can emulate "VMCS shadowing," even if the hardware 5865 * doesn't support it. 5866 */ 5867 msrs->secondary_ctls_high |= 5868 SECONDARY_EXEC_SHADOW_VMCS; 5869 5870 if (enable_ept) { 5871 /* nested EPT: emulate EPT also to L1 */ 5872 msrs->secondary_ctls_high |= 5873 SECONDARY_EXEC_ENABLE_EPT; 5874 msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | 5875 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; 5876 if (cpu_has_vmx_ept_execute_only()) 5877 msrs->ept_caps |= 5878 VMX_EPT_EXECUTE_ONLY_BIT; 5879 msrs->ept_caps &= ept_caps; 5880 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 5881 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 5882 VMX_EPT_1GB_PAGE_BIT; 5883 if (enable_ept_ad_bits) { 5884 msrs->secondary_ctls_high |= 5885 SECONDARY_EXEC_ENABLE_PML; 5886 msrs->ept_caps |= VMX_EPT_AD_BIT; 5887 } 5888 } 5889 5890 if (cpu_has_vmx_vmfunc()) { 5891 msrs->secondary_ctls_high |= 5892 SECONDARY_EXEC_ENABLE_VMFUNC; 5893 /* 5894 * Advertise EPTP switching unconditionally 5895 * since we emulate it 5896 */ 5897 if (enable_ept) 5898 msrs->vmfunc_controls = 5899 VMX_VMFUNC_EPTP_SWITCHING; 5900 } 5901 5902 /* 5903 * Old versions of KVM use the single-context version without 5904 * checking for support, so declare that it is supported even 5905 * though it is treated as global context. The alternative is 5906 * not failing the single-context invvpid, and it is worse. 5907 */ 5908 if (enable_vpid) { 5909 msrs->secondary_ctls_high |= 5910 SECONDARY_EXEC_ENABLE_VPID; 5911 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 5912 VMX_VPID_EXTENT_SUPPORTED_MASK; 5913 } 5914 5915 if (enable_unrestricted_guest) 5916 msrs->secondary_ctls_high |= 5917 SECONDARY_EXEC_UNRESTRICTED_GUEST; 5918 5919 if (flexpriority_enabled) 5920 msrs->secondary_ctls_high |= 5921 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 5922 5923 /* miscellaneous data */ 5924 rdmsr(MSR_IA32_VMX_MISC, 5925 msrs->misc_low, 5926 msrs->misc_high); 5927 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; 5928 msrs->misc_low |= 5929 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 5930 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 5931 VMX_MISC_ACTIVITY_HLT; 5932 msrs->misc_high = 0; 5933 5934 /* 5935 * This MSR reports some information about VMX support. We 5936 * should return information about the VMX we emulate for the 5937 * guest, and the VMCS structure we give it - not about the 5938 * VMX support of the underlying hardware. 5939 */ 5940 msrs->basic = 5941 VMCS12_REVISION | 5942 VMX_BASIC_TRUE_CTLS | 5943 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 5944 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 5945 5946 if (cpu_has_vmx_basic_inout()) 5947 msrs->basic |= VMX_BASIC_INOUT; 5948 5949 /* 5950 * These MSRs specify bits which the guest must keep fixed on 5951 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 
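 * (Concretely CR0.PE, CR0.PG and CR0.NE, plus CR4.VMXE, per the
 * VMXON_CR0_ALWAYSON/VMXON_CR4_ALWAYSON values below.)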
* 5952 * We picked the standard core2 setting. 5953 */ 5954 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 5955 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 5956 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 5957 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 5958 5959 /* These MSRs specify bits which the guest must keep fixed off. */ 5960 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 5961 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 5962 5963 /* highest index: VMX_PREEMPTION_TIMER_VALUE */ 5964 msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; 5965 } 5966 5967 void nested_vmx_hardware_unsetup(void) 5968 { 5969 int i; 5970 5971 if (enable_shadow_vmcs) { 5972 for (i = 0; i < VMX_BITMAP_NR; i++) 5973 free_page((unsigned long)vmx_bitmap[i]); 5974 } 5975 } 5976 5977 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 5978 { 5979 int i; 5980 5981 if (!cpu_has_vmx_shadow_vmcs()) 5982 enable_shadow_vmcs = 0; 5983 if (enable_shadow_vmcs) { 5984 for (i = 0; i < VMX_BITMAP_NR; i++) { 5985 /* 5986 * The vmx_bitmap is not tied to a VM and so should 5987 * not be charged to a memcg. 5988 */ 5989 vmx_bitmap[i] = (unsigned long *) 5990 __get_free_page(GFP_KERNEL); 5991 if (!vmx_bitmap[i]) { 5992 nested_vmx_hardware_unsetup(); 5993 return -ENOMEM; 5994 } 5995 } 5996 5997 init_vmcs_shadow_fields(); 5998 } 5999 6000 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 6001 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 6002 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 6003 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 6004 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 6005 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 6006 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 6007 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; 6008 exit_handlers[EXIT_REASON_VMON] = handle_vmon; 6009 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 6010 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 6011 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 6012 6013 kvm_x86_ops->check_nested_events = vmx_check_nested_events; 6014 kvm_x86_ops->get_nested_state = vmx_get_nested_state; 6015 kvm_x86_ops->set_nested_state = vmx_set_nested_state; 6016 kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages; 6017 kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; 6018 kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version; 6019 6020 return 0; 6021 } 6022
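/*
 * Usage sketch (illustrative only, not part of this file): the module-level
 * hardware setup in vmx.c is expected to wire the pieces above together
 * roughly as follows.  The exact call sites, the kvm_vmx_exit_handlers
 * array and the vmcs_config/vmx_capability globals live in vmx.c and may
 * differ in detail.
 *
 *	if (nested) {
 *		nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
 *					   vmx_capability.ept, enable_apicv);
 *
 *		r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
 *		if (r)
 *			return r;
 *	}
 */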