// SPDX-License-Identifier: GPL-2.0

#include <linux/frame.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "trace.h"
#include "x86.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC(consistency_check)						\
({									\
	bool failed = (consistency_check);				\
	if (failed)							\
		trace_kvm_nested_vmenter_failed(#consistency_check, 0);	\
	failed;								\
})

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
		return nested_vmx_failInvalid(vcpu);

	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force a shadow sync because
	 * VM_INSTRUCTION_ERROR is not shadowed
	 */
	return kvm_skip_emulated_instruction(vcpu);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't simply reset the guest here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, -1ull);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.hv_evmcs)
		return;

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
	vmx->nested.hv_evmcs_vmptr = -1ull;
	vmx->nested.hv_evmcs = NULL;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_dirty(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (vmx->loaded_vmcs == vmcs)
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vmx_segment_cache_clear(vmx);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
	free_nested(vcpu);
	vcpu_put(vcpu);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
		struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else if (fault->error_code & PFERR_RSVD_MASK)
		exit_reason = EXIT_REASON_EPT_MISCONFIG;
	else
		exit_reason = EXIT_REASON_EPT_VIOLATION;

	nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	kvm_init_shadow_ept_mmu(vcpu,
			to_vmx(vcpu)->nested.msrs.ept_caps &
			VMX_EPT_EXECUTE_ONLY_BIT,
			nested_ept_ad_enabled(vcpu),
			nested_ept_get_cr3(vcpu));
	vcpu->arch.mmu->set_cr3 = vmx_set_cr3;
	vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}
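
/*
 * Example of the mask/match test above: with bit 14 (#PF) set in the
 * exception bitmap, PFEC_MASK = 1 and PFEC_MATCH = 1, only page faults
 * with the P bit set in the error code satisfy
 * (error_code & mask) == match and are reflected to L1; all other page
 * faults are delivered to L2 without a nested VM-exit.
 */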

/*
 * KVM wants to inject the page faults it received into the guest. This
 * function checks whether, in a nested guest, they need to be injected
 * into L1 or L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (nr == PF_VECTOR) {
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
			return 1;
		}
	} else if (vmcs12->exception_bitmap & (1u << nr)) {
		if (nr == DB_VECTOR) {
			if (!has_payload) {
				payload = vcpu->arch.dr6;
				payload &= ~(DR6_FIXED_1 | DR6_BT);
				payload ^= DR6_RTM;
			}
			*exit_qual = payload;
		} else
			*exit_qual = 0;
		return 1;
	}

	return 0;
}


static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
		struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
	    !to_vmx(vcpu)->nested.nested_run_pending) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}

static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * Check if MSR is intercepted for L01 MSR bitmap.
 */
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;

	if (msr <= 0x1fff) {
		return !!test_bit(msr, msr_bitmap + 0x800 / f);
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
	}

	return true;
}
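
/*
 * Note on layout: a VMX MSR bitmap is a single 4-KByte page holding four
 * 1024-bit vectors: read-low at 0x000 (MSRs 0x00000000-0x00001fff),
 * read-high at 0x400 (MSRs 0xc0000000-0xc0001fff), write-low at 0x800 and
 * write-high at 0xc00.  A set bit means the corresponding access causes a
 * VM exit, which is why the helper above tests the 0x800/0xc00 (write)
 * vectors and the helper below clears bits only when both L0 and L1 allow
 * the access.
 */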

/*
 * If a msr is allowed by L0, we should check whether it is allowed by L1.
 * The corresponding bit will be cleared unless both of L0 and L1 allow it.
 */
static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
						 unsigned long *msr_bitmap_nested,
						 u32 msr, int type)
{
	int f = sizeof(unsigned long);

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way
	 * round. We can control MSRs 0x00000000-0x00001fff and
	 * 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
			/* read-low */
			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
			/* write-low */
			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
			/* read-high */
			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
			/* write-high */
			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

	}
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively disable them below.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);

	/*
	 * Checking the L0->L1 bitmap is trying to verify two things:
	 *
	 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
	 *    ensures that we do not accidentally generate an L02 MSR bitmap
	 *    from the L12 MSR bitmap that is too permissive.
	 * 2. That L1 or L2s have actually used the MSR. This avoids
	 *    unnecessary merging of the bitmap if the MSR is unused. This
	 *    works properly because we only update the L01 MSR bitmap lazily.
	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
	 *    updated to reflect this when L1 (or its L2s) actually write to
	 *    the MSR.
	 */
	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_SPEC_CTRL,
					MSR_TYPE_R | MSR_TYPE_W);

	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_PRED_CMD,
					MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct kvm_host_map map;
	struct vmcs12 *shadow;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	shadow = get_shadow_vmcs12(vcpu);

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
		return;

	memcpy(shadow, map.hva, VMCS12_SIZE);
	kvm_vcpu_unmap(vcpu, &map, false);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
	return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	   (CC(!nested_cpu_has_vid(vmcs12)) ||
	    CC(!nested_exit_intr_ack_set(vcpu)) ||
	    CC((vmcs12->posted_intr_nv & 0xff00)) ||
	    CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
	    CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	int maxphyaddr;

	if (count == 0)
		return 0;
	maxphyaddr = cpuid_maxphyaddr(vcpu);
	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * Return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware as
 * much as possible, process all valid entries before failing rather than
 * prechecking for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	return i + 1;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (kvm_vcpu_read_guest(vcpu,
					gpa + i * sizeof(e),
					&e, 2 * sizeof(u32))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			return -EINVAL;
		}
		if (nested_vmx_store_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			return -EINVAL;
		}
		if (kvm_get_msr(vcpu, e.index, &data)) {
			pr_debug_ratelimited(
				"%s cannot read MSR (%u, 0x%x)\n",
				__func__, i, e.index);
			return -EINVAL;
		}
		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
					     offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
	unsigned long invalid_mask;

	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
	return (val & invalid_mask) == 0;
}
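
/*
 * For example, with a guest MAXPHYADDR of 48, invalid_mask above is bits
 * 63:48, i.e. any CR3 value with address bits above the guest's physical
 * address width is rejected.
 */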

/*
 * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
 * emulating VM entry into a guest with EPT enabled.
 * Returns 0 on success, -EINVAL on failure. The invalid-state exit
 * qualification code is assigned to entry_failure_code on failure.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
			       u32 *entry_failure_code)
{
	if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
		if (CC(!nested_cr3_valid(vcpu, cr3))) {
			*entry_failure_code = ENTRY_FAIL_DEFAULT;
			return -EINVAL;
		}

		/*
		 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
		 * must not be dereferenced.
		 */
		if (is_pae_paging(vcpu) && !nested_ept) {
			if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
				*entry_failure_code = ENTRY_FAIL_PDPTE;
				return -EINVAL;
			}
		}
	}

	if (!nested_ept)
		kvm_mmu_new_cr3(vcpu, cr3, false);

	vcpu->arch.cr3 = cr3;
	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);

	kvm_init_mmu(vcpu, false);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L1 uses EPT, then TLB entries are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return nested_cpu_has_ept(vmcs12) ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}
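
/*
 * is_bitwise_subset() returns true iff every bit of @subset that falls
 * within @mask is also set in @superset, e.g. superset = 0b1010,
 * subset = 0b0010, mask = 0xf -> true.  The vmx_restore_*() helpers below
 * rely on it to reject a userspace-provided capability MSR value that
 * would relax what KVM reports as supported (and, for the FIXED0 MSRs,
 * that would clear must-be-1 bits).
 */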

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmx->nested.msrs.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 supported;
	u32 *lowp, *highp;

	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		lowp = &vmx->nested.msrs.pinbased_ctls_low;
		highp = &vmx->nested.msrs.pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		lowp = &vmx->nested.msrs.procbased_ctls_low;
		highp = &vmx->nested.msrs.procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		lowp = &vmx->nested.msrs.exit_ctls_low;
		highp = &vmx->nested.msrs.exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		lowp = &vmx->nested.msrs.entry_ctls_low;
		highp = &vmx->nested.msrs.entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		lowp = &vmx->nested.msrs.secondary_ctls_low;
		highp = &vmx->nested.msrs.secondary_ctls_high;
		break;
	default:
		BUG();
	}

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc;

	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				   vmx->nested.msrs.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap;

	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
					   vmx->nested.msrs.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 *msr;

	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		msr = &vmx->nested.msrs.cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		msr = &vmx->nested.msrs.cr4_fixed0;
		break;
	default:
		BUG();
	}

	/*
	 * 1 bits (which indicate bits that are "must-be-1" during VMX
	 * operation) must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*msr = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmx->nested.msrs.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr =
			evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
	}

	/*
	 * Not used?
	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
	 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
	 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
	 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
	 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
	 * vmcs12->page_fault_error_code_mask =
	 *		evmcs->page_fault_error_code_mask;
	 * vmcs12->page_fault_error_code_match =
	 *		evmcs->page_fault_error_code_match;
	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
	 */

	/*
	 * Read only fields:
	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
	 * vmcs12->exit_qualification = evmcs->exit_qualification;
	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
	 *
	 * Not present in struct vmcs12:
	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return 0;
}

static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/*
	 * Should not be changed by KVM:
	 *
	 * evmcs->host_es_selector = vmcs12->host_es_selector;
	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
	 * evmcs->host_cr0 = vmcs12->host_cr0;
	 * evmcs->host_cr3 = vmcs12->host_cr3;
	 * evmcs->host_cr4 = vmcs12->host_cr4;
	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
	 * evmcs->host_rip = vmcs12->host_rip;
	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
	 * evmcs->host_fs_base = vmcs12->host_fs_base;
	 * evmcs->host_gs_base = vmcs12->host_gs_base;
	 * evmcs->host_tr_base = vmcs12->host_tr_base;
	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
	 * evmcs->host_rsp = vmcs12->host_rsp;
	 * sync_vmcs02_to_vmcs12() doesn't read these:
	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
	 * evmcs->ept_pointer = vmcs12->ept_pointer;
	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
	 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
	 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
	 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
	 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
	 * evmcs->page_fault_error_code_mask =
	 *		vmcs12->page_fault_error_code_mask;
	 * evmcs->page_fault_error_code_match =
	 *		vmcs12->page_fault_error_code_match;
	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
	 * evmcs->tsc_offset = vmcs12->tsc_offset;
	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
	 *
	 * Not present in struct vmcs12:
	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
	 */

	evmcs->guest_es_selector = vmcs12->guest_es_selector;
	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;

	evmcs->guest_es_limit = vmcs12->guest_es_limit;
	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;

	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
vmcs12->guest_cs_ar_bytes; 1742 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1743 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1744 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1745 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1746 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1747 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1748 1749 evmcs->guest_es_base = vmcs12->guest_es_base; 1750 evmcs->guest_cs_base = vmcs12->guest_cs_base; 1751 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1752 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1753 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1754 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1755 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1756 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1757 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1758 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1759 1760 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1761 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1762 1763 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1764 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1765 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1766 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1767 1768 evmcs->guest_pending_dbg_exceptions = 1769 vmcs12->guest_pending_dbg_exceptions; 1770 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1771 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1772 1773 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1774 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1775 1776 evmcs->guest_cr0 = vmcs12->guest_cr0; 1777 evmcs->guest_cr3 = vmcs12->guest_cr3; 1778 evmcs->guest_cr4 = vmcs12->guest_cr4; 1779 evmcs->guest_dr7 = vmcs12->guest_dr7; 1780 1781 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1782 1783 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1784 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1785 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1786 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1787 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1788 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1789 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1790 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1791 1792 evmcs->exit_qualification = vmcs12->exit_qualification; 1793 1794 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1795 evmcs->guest_rsp = vmcs12->guest_rsp; 1796 evmcs->guest_rflags = vmcs12->guest_rflags; 1797 1798 evmcs->guest_interruptibility_info = 1799 vmcs12->guest_interruptibility_info; 1800 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1801 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1802 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1803 evmcs->vm_entry_exception_error_code = 1804 vmcs12->vm_entry_exception_error_code; 1805 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 1806 1807 evmcs->guest_rip = vmcs12->guest_rip; 1808 1809 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 1810 1811 return 0; 1812 } 1813 1814 /* 1815 * This is an equivalent of the nested hypervisor executing the vmptrld 1816 * instruction. 
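 *
 * (Note: with an enlightened VMCS there is no literal VMPTRLD operand;
 * the eVMCS GPA is instead obtained via nested_enlightened_vmentry()
 * below, and a change in that GPA is what triggers remapping the eVMCS.)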
1817	 */
1818	static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1819						 bool from_launch)
1820	{
1821		struct vcpu_vmx *vmx = to_vmx(vcpu);
1822		bool evmcs_gpa_changed = false;
1823		u64 evmcs_gpa;
1824
1825		if (likely(!vmx->nested.enlightened_vmcs_enabled))
1826			return 1;
1827
1828		if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
1829			return 1;
1830
1831		if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1832			if (!vmx->nested.hv_evmcs)
1833				vmx->nested.current_vmptr = -1ull;
1834
1835			nested_release_evmcs(vcpu);
1836
1837			if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
1838					 &vmx->nested.hv_evmcs_map))
1839				return 0;
1840
1841			vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
1842
1843			/*
1844			 * Currently, KVM only supports eVMCS version 1
1845			 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set
1846			 * the first u32 field of the eVMCS, which specifies the eVMCS
1847			 * VersionNumber, to this value.
1848			 *
1849			 * The guest should learn the eVMCS versions supported by the
1850			 * host by examining CPUID.0x4000000A.EAX[0:15]. The host
1851			 * userspace VMM is expected to set this CPUID leaf according
1852			 * to the value returned in vmcs_version from nested_enable_evmcs().
1853			 *
1854			 * However, it turns out that Microsoft Hyper-V fails to comply
1855			 * with its own invented interface: when Hyper-V uses eVMCS, it
1856			 * just sets the first u32 field of the eVMCS to the revision_id
1857			 * specified in MSR_IA32_VMX_BASIC, instead of the eVMCS version
1858			 * number, which should be one of the supported versions
1859			 * specified in CPUID.0x4000000A.EAX[0:15].
1860			 *
1861			 * To work around this Hyper-V bug, accept either a supported
1862			 * eVMCS version or the VMCS12 revision_id as valid values for
1863			 * the first u32 field of the eVMCS.
1864			 */
1865			if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
1866			    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
1867				nested_release_evmcs(vcpu);
1868				return 0;
1869			}
1870
1871			vmx->nested.dirty_vmcs12 = true;
1872			vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
1873
1874			evmcs_gpa_changed = true;
1875			/*
1876			 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
1877			 * reloaded from the guest's memory (read-only fields, fields
1878			 * not present in struct hv_enlightened_vmcs, ...). Make sure
1879			 * there are no leftovers.
1880			 */
1881			if (from_launch) {
1882				struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1883				memset(vmcs12, 0, sizeof(*vmcs12));
1884				vmcs12->hdr.revision_id = VMCS12_REVISION;
1885			}
1886
1887		}
1888
1889		/*
1890		 * Clean fields data can't be used on VMLAUNCH, or when we switch
1891		 * between different L2 guests, as KVM keeps a single vmcs12 per L1.
1892		 */
1893		if (from_launch || evmcs_gpa_changed)
1894			vmx->nested.hv_evmcs->hv_clean_fields &=
1895				~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1896
1897		return 1;
1898	}
1899
1900	void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
1901	{
1902		struct vcpu_vmx *vmx = to_vmx(vcpu);
1903
1904		/*
1905		 * hv_evmcs may end up not being mapped after migration (when
1906		 * L2 was running); map it here to make sure vmcs12 changes are
1907		 * properly reflected.
1908 */ 1909 if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) 1910 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 1911 1912 if (vmx->nested.hv_evmcs) { 1913 copy_vmcs12_to_enlightened(vmx); 1914 /* All fields are clean */ 1915 vmx->nested.hv_evmcs->hv_clean_fields |= 1916 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 1917 } else { 1918 copy_vmcs12_to_shadow(vmx); 1919 } 1920 1921 vmx->nested.need_vmcs12_to_shadow_sync = false; 1922 } 1923 1924 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 1925 { 1926 struct vcpu_vmx *vmx = 1927 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 1928 1929 vmx->nested.preemption_timer_expired = true; 1930 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 1931 kvm_vcpu_kick(&vmx->vcpu); 1932 1933 return HRTIMER_NORESTART; 1934 } 1935 1936 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) 1937 { 1938 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; 1939 struct vcpu_vmx *vmx = to_vmx(vcpu); 1940 1941 /* 1942 * A timer value of zero is architecturally guaranteed to cause 1943 * a VMExit prior to executing any instructions in the guest. 1944 */ 1945 if (preemption_timeout == 0) { 1946 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 1947 return; 1948 } 1949 1950 if (vcpu->arch.virtual_tsc_khz == 0) 1951 return; 1952 1953 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 1954 preemption_timeout *= 1000000; 1955 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 1956 hrtimer_start(&vmx->nested.preemption_timer, 1957 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); 1958 } 1959 1960 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 1961 { 1962 if (vmx->nested.nested_run_pending && 1963 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 1964 return vmcs12->guest_ia32_efer; 1965 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 1966 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 1967 else 1968 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 1969 } 1970 1971 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 1972 { 1973 /* 1974 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 1975 * according to L0's settings (vmcs12 is irrelevant here). Host 1976 * fields that come from L0 and are not constant, e.g. HOST_CR3, 1977 * will be set as needed prior to VMLAUNCH/VMRESUME. 1978 */ 1979 if (vmx->nested.vmcs02_initialized) 1980 return; 1981 vmx->nested.vmcs02_initialized = true; 1982 1983 /* 1984 * We don't care what the EPTP value is we just need to guarantee 1985 * it's valid so we don't get a false positive when doing early 1986 * consistency checks. 1987 */ 1988 if (enable_ept && nested_early_check) 1989 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); 1990 1991 /* All VMFUNCs are currently emulated through L0 vmexits. */ 1992 if (cpu_has_vmx_vmfunc()) 1993 vmcs_write64(VM_FUNCTION_CONTROL, 0); 1994 1995 if (cpu_has_vmx_posted_intr()) 1996 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 1997 1998 if (cpu_has_vmx_msr_bitmap()) 1999 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2000 2001 /* 2002 * The PML address never changes, so it is constant in vmcs02. 2003 * Conceptually we want to copy the PML index from vmcs01 here, 2004 * and then back to vmcs01 on nested vmexit. But since we flush 2005 * the log and reset GUEST_PML_INDEX on each vmexit, the PML 2006 * index is also effectively constant in vmcs02. 
2007 */ 2008 if (enable_pml) { 2009 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 2010 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 2011 } 2012 2013 if (cpu_has_vmx_encls_vmexit()) 2014 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); 2015 2016 /* 2017 * Set the MSR load/store lists to match L0's settings. Only the 2018 * addresses are constant (for vmcs02), the counts can change based 2019 * on L2's behavior, e.g. switching to/from long mode. 2020 */ 2021 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 2022 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2023 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2024 2025 vmx_set_constant_host_state(vmx); 2026 } 2027 2028 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2029 struct vmcs12 *vmcs12) 2030 { 2031 prepare_vmcs02_constant_state(vmx); 2032 2033 vmcs_write64(VMCS_LINK_POINTER, -1ull); 2034 2035 if (enable_vpid) { 2036 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2037 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2038 else 2039 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2040 } 2041 } 2042 2043 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2044 { 2045 u32 exec_control, vmcs12_exec_ctrl; 2046 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2047 2048 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) 2049 prepare_vmcs02_early_rare(vmx, vmcs12); 2050 2051 /* 2052 * PIN CONTROLS 2053 */ 2054 exec_control = vmx_pin_based_exec_ctrl(vmx); 2055 exec_control |= (vmcs12->pin_based_vm_exec_control & 2056 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2057 2058 /* Posted interrupts setting is only taken from vmcs12. */ 2059 if (nested_cpu_has_posted_intr(vmcs12)) { 2060 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2061 vmx->nested.pi_pending = false; 2062 } else { 2063 exec_control &= ~PIN_BASED_POSTED_INTR; 2064 } 2065 pin_controls_set(vmx, exec_control); 2066 2067 /* 2068 * EXEC CONTROLS 2069 */ 2070 exec_control = vmx_exec_control(vmx); /* L0's desires */ 2071 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 2072 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 2073 exec_control &= ~CPU_BASED_TPR_SHADOW; 2074 exec_control |= vmcs12->cpu_based_vm_exec_control; 2075 2076 if (exec_control & CPU_BASED_TPR_SHADOW) 2077 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2078 #ifdef CONFIG_X86_64 2079 else 2080 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2081 CPU_BASED_CR8_STORE_EXITING; 2082 #endif 2083 2084 /* 2085 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2086 * for I/O port accesses. 2087 */ 2088 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2089 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2090 2091 /* 2092 * This bit will be computed in nested_get_vmcs12_pages, because 2093 * we do not have access to L1's MSR bitmap yet. For now, keep 2094 * the same bit as before, hoping to avoid multiple VMWRITEs that 2095 * only set/clear this bit. 
2096 */ 2097 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2098 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2099 2100 exec_controls_set(vmx, exec_control); 2101 2102 /* 2103 * SECONDARY EXEC CONTROLS 2104 */ 2105 if (cpu_has_secondary_exec_ctrls()) { 2106 exec_control = vmx->secondary_exec_control; 2107 2108 /* Take the following fields only from vmcs12 */ 2109 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2110 SECONDARY_EXEC_ENABLE_INVPCID | 2111 SECONDARY_EXEC_RDTSCP | 2112 SECONDARY_EXEC_XSAVES | 2113 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2114 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2115 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2116 SECONDARY_EXEC_ENABLE_VMFUNC); 2117 if (nested_cpu_has(vmcs12, 2118 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { 2119 vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & 2120 ~SECONDARY_EXEC_ENABLE_PML; 2121 exec_control |= vmcs12_exec_ctrl; 2122 } 2123 2124 /* VMCS shadowing for L2 is emulated for now */ 2125 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2126 2127 /* 2128 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2129 * will not have to rewrite the controls just for this bit. 2130 */ 2131 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2132 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2133 exec_control |= SECONDARY_EXEC_DESC; 2134 2135 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2136 vmcs_write16(GUEST_INTR_STATUS, 2137 vmcs12->guest_intr_status); 2138 2139 secondary_exec_controls_set(vmx, exec_control); 2140 } 2141 2142 /* 2143 * ENTRY CONTROLS 2144 * 2145 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2146 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2147 * on the related bits (if supported by the CPU) in the hope that 2148 * we can avoid VMWrites during vmx_set_efer(). 2149 */ 2150 exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & 2151 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; 2152 if (cpu_has_load_ia32_efer()) { 2153 if (guest_efer & EFER_LMA) 2154 exec_control |= VM_ENTRY_IA32E_MODE; 2155 if (guest_efer != host_efer) 2156 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2157 } 2158 vm_entry_controls_set(vmx, exec_control); 2159 2160 /* 2161 * EXIT CONTROLS 2162 * 2163 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2164 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2165 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
2166 */ 2167 exec_control = vmx_vmexit_ctrl(); 2168 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2169 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2170 vm_exit_controls_set(vmx, exec_control); 2171 2172 /* 2173 * Interrupt/Exception Fields 2174 */ 2175 if (vmx->nested.nested_run_pending) { 2176 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2177 vmcs12->vm_entry_intr_info_field); 2178 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2179 vmcs12->vm_entry_exception_error_code); 2180 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2181 vmcs12->vm_entry_instruction_len); 2182 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2183 vmcs12->guest_interruptibility_info); 2184 vmx->loaded_vmcs->nmi_known_unmasked = 2185 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2186 } else { 2187 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2188 } 2189 } 2190 2191 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2192 { 2193 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2194 2195 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2196 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2197 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2198 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2199 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2200 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2201 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2202 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2203 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2204 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2205 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2206 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2207 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2208 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2209 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2210 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2211 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2212 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2213 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2214 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2215 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2216 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2217 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2218 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2219 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2220 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2221 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2222 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2223 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2224 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2225 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2226 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2227 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2228 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2229 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2230 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2231 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2232 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2233 } 2234 2235 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2236 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2237 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2238 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 
2239 vmcs12->guest_pending_dbg_exceptions); 2240 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2241 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2242 2243 /* 2244 * L1 may access the L2's PDPTR, so save them to construct 2245 * vmcs12 2246 */ 2247 if (enable_ept) { 2248 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2249 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2250 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2251 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2252 } 2253 2254 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2255 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2256 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2257 } 2258 2259 if (nested_cpu_has_xsaves(vmcs12)) 2260 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2261 2262 /* 2263 * Whether page-faults are trapped is determined by a combination of 2264 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. 2265 * If enable_ept, L0 doesn't care about page faults and we should 2266 * set all of these to L1's desires. However, if !enable_ept, L0 does 2267 * care about (at least some) page faults, and because it is not easy 2268 * (if at all possible?) to merge L0 and L1's desires, we simply ask 2269 * to exit on each and every L2 page fault. This is done by setting 2270 * MASK=MATCH=0 and (see below) EB.PF=1. 2271 * Note that below we don't need special code to set EB.PF beyond the 2272 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2273 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2274 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2275 */ 2276 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 2277 enable_ept ? vmcs12->page_fault_error_code_mask : 0); 2278 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 2279 enable_ept ? vmcs12->page_fault_error_code_match : 0); 2280 2281 if (cpu_has_vmx_apicv()) { 2282 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2283 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2284 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2285 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2286 } 2287 2288 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2289 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2290 2291 set_cr4_guest_host_mask(vmx); 2292 } 2293 2294 /* 2295 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2296 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2297 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2298 * guest in a way that will both be appropriate to L1's requests, and our 2299 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2300 * function also has additional necessary side-effects, like setting various 2301 * vcpu->arch fields. 2302 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2303 * is assigned to entry_failure_code on failure. 
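 * (Note: in the code below the failure value is actually -EINVAL rather
 * than 1; callers only check for a non-zero return.)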
2304 */ 2305 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2306 u32 *entry_failure_code) 2307 { 2308 struct vcpu_vmx *vmx = to_vmx(vcpu); 2309 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2310 bool load_guest_pdptrs_vmcs12 = false; 2311 2312 if (vmx->nested.dirty_vmcs12 || hv_evmcs) { 2313 prepare_vmcs02_rare(vmx, vmcs12); 2314 vmx->nested.dirty_vmcs12 = false; 2315 2316 load_guest_pdptrs_vmcs12 = !hv_evmcs || 2317 !(hv_evmcs->hv_clean_fields & 2318 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2319 } 2320 2321 if (vmx->nested.nested_run_pending && 2322 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2323 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2324 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2325 } else { 2326 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2327 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 2328 } 2329 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2330 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2331 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); 2332 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2333 2334 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2335 * bitwise-or of what L1 wants to trap for L2, and what we want to 2336 * trap. Note that CR0.TS also needs updating - we do this later. 2337 */ 2338 update_exception_bitmap(vcpu); 2339 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2340 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2341 2342 if (vmx->nested.nested_run_pending && 2343 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2344 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2345 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2346 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2347 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2348 } 2349 2350 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2351 2352 if (kvm_has_tsc_control) 2353 decache_tsc_multiplier(vmx); 2354 2355 if (enable_vpid) { 2356 /* 2357 * There is no direct mapping between vpid02 and vpid12, the 2358 * vpid02 is per-vCPU for L0 and reused while the value of 2359 * vpid12 is changed w/ one invvpid during nested vmentry. 2360 * The vpid12 is allocated by L1 for L2, so it will not 2361 * influence global bitmap(for vpid01 and vpid02 allocation) 2362 * even if spawn a lot of nested vCPUs. 2363 */ 2364 if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { 2365 if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 2366 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 2367 __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); 2368 } 2369 } else { 2370 /* 2371 * If L1 use EPT, then L0 needs to execute INVEPT on 2372 * EPTP02 instead of EPTP01. Therefore, delay TLB 2373 * flush until vmcs02->eptp is fully updated by 2374 * KVM_REQ_LOAD_CR3. Note that this assumes 2375 * KVM_REQ_TLB_FLUSH is evaluated after 2376 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). 2377 */ 2378 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2379 } 2380 } 2381 2382 if (nested_cpu_has_ept(vmcs12)) 2383 nested_ept_init_mmu_context(vcpu); 2384 else if (nested_cpu_has2(vmcs12, 2385 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2386 vmx_flush_tlb(vcpu, true); 2387 2388 /* 2389 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those 2390 * bits which we consider mandatory enabled. 
2391 * The CR0_READ_SHADOW is what L2 should have expected to read given 2392 * the specifications by L1; It's not enough to take 2393 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we 2394 * have more bits than L1 expected. 2395 */ 2396 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2397 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2398 2399 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2400 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2401 2402 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2403 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2404 vmx_set_efer(vcpu, vcpu->arch.efer); 2405 2406 /* 2407 * Guest state is invalid and unrestricted guest is disabled, 2408 * which means L1 attempted VMEntry to L2 with invalid state. 2409 * Fail the VMEntry. 2410 */ 2411 if (vmx->emulation_required) { 2412 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2413 return -EINVAL; 2414 } 2415 2416 /* Shadow page tables on either EPT or shadow page tables. */ 2417 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2418 entry_failure_code)) 2419 return -EINVAL; 2420 2421 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2422 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2423 is_pae_paging(vcpu)) { 2424 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2425 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2426 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2427 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2428 } 2429 2430 if (!enable_ept) 2431 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2432 2433 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2434 kvm_rip_write(vcpu, vmcs12->guest_rip); 2435 return 0; 2436 } 2437 2438 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2439 { 2440 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2441 nested_cpu_has_virtual_nmis(vmcs12))) 2442 return -EINVAL; 2443 2444 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2445 nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))) 2446 return -EINVAL; 2447 2448 return 0; 2449 } 2450 2451 static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) 2452 { 2453 struct vcpu_vmx *vmx = to_vmx(vcpu); 2454 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2455 2456 /* Check for memory type validity */ 2457 switch (address & VMX_EPTP_MT_MASK) { 2458 case VMX_EPTP_MT_UC: 2459 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2460 return false; 2461 break; 2462 case VMX_EPTP_MT_WB: 2463 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2464 return false; 2465 break; 2466 default: 2467 return false; 2468 } 2469 2470 /* only 4 levels page-walk length are valid */ 2471 if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)) 2472 return false; 2473 2474 /* Reserved bits should not be set */ 2475 if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f))) 2476 return false; 2477 2478 /* AD, if set, should be supported */ 2479 if (address & VMX_EPTP_AD_ENABLE_BIT) { 2480 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2481 return false; 2482 } 2483 2484 return true; 2485 } 2486 2487 /* 2488 * Checks related to VM-Execution Control Fields 2489 */ 2490 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2491 struct vmcs12 *vmcs12) 2492 { 2493 struct vcpu_vmx *vmx = to_vmx(vcpu); 2494 2495 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2496 vmx->nested.msrs.pinbased_ctls_low, 2497 vmx->nested.msrs.pinbased_ctls_high)) || 2498 
CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2499 vmx->nested.msrs.procbased_ctls_low, 2500 vmx->nested.msrs.procbased_ctls_high))) 2501 return -EINVAL; 2502 2503 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2504 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2505 vmx->nested.msrs.secondary_ctls_low, 2506 vmx->nested.msrs.secondary_ctls_high))) 2507 return -EINVAL; 2508 2509 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2510 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2511 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2512 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2513 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2514 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2515 nested_vmx_check_nmi_controls(vmcs12) || 2516 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2517 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2518 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2519 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2520 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2521 return -EINVAL; 2522 2523 if (!nested_cpu_has_preemption_timer(vmcs12) && 2524 nested_cpu_has_save_preemption_timer(vmcs12)) 2525 return -EINVAL; 2526 2527 if (nested_cpu_has_ept(vmcs12) && 2528 CC(!valid_ept_address(vcpu, vmcs12->ept_pointer))) 2529 return -EINVAL; 2530 2531 if (nested_cpu_has_vmfunc(vmcs12)) { 2532 if (CC(vmcs12->vm_function_control & 2533 ~vmx->nested.msrs.vmfunc_controls)) 2534 return -EINVAL; 2535 2536 if (nested_cpu_has_eptp_switching(vmcs12)) { 2537 if (CC(!nested_cpu_has_ept(vmcs12)) || 2538 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2539 return -EINVAL; 2540 } 2541 } 2542 2543 return 0; 2544 } 2545 2546 /* 2547 * Checks related to VM-Exit Control Fields 2548 */ 2549 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2550 struct vmcs12 *vmcs12) 2551 { 2552 struct vcpu_vmx *vmx = to_vmx(vcpu); 2553 2554 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2555 vmx->nested.msrs.exit_ctls_low, 2556 vmx->nested.msrs.exit_ctls_high)) || 2557 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2558 return -EINVAL; 2559 2560 return 0; 2561 } 2562 2563 /* 2564 * Checks related to VM-Entry Control Fields 2565 */ 2566 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2567 struct vmcs12 *vmcs12) 2568 { 2569 struct vcpu_vmx *vmx = to_vmx(vcpu); 2570 2571 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2572 vmx->nested.msrs.entry_ctls_low, 2573 vmx->nested.msrs.entry_ctls_high))) 2574 return -EINVAL; 2575 2576 /* 2577 * From the Intel SDM, volume 3: 2578 * Fields relevant to VM-entry event injection must be set properly. 2579 * These fields are the VM-entry interruption-information field, the 2580 * VM-entry exception error code, and the VM-entry instruction length. 
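	 *
	 * For example, injecting a hardware exception that delivers an error
	 * code, such as #GP (vector 13), would use an interruption-information
	 * field of 13 | INTR_TYPE_HARD_EXCEPTION | INTR_INFO_DELIVER_CODE_MASK |
	 * INTR_INFO_VALID_MASK, with the error code itself placed in the
	 * VM-entry exception error-code field.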
2581 */ 2582 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2583 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2584 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2585 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2586 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2587 bool should_have_error_code; 2588 bool urg = nested_cpu_has2(vmcs12, 2589 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2590 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2591 2592 /* VM-entry interruption-info field: interruption type */ 2593 if (CC(intr_type == INTR_TYPE_RESERVED) || 2594 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2595 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2596 return -EINVAL; 2597 2598 /* VM-entry interruption-info field: vector */ 2599 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2600 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2601 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2602 return -EINVAL; 2603 2604 /* VM-entry interruption-info field: deliver error code */ 2605 should_have_error_code = 2606 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2607 x86_exception_has_error_code(vector); 2608 if (CC(has_error_code != should_have_error_code)) 2609 return -EINVAL; 2610 2611 /* VM-entry exception error code */ 2612 if (CC(has_error_code && 2613 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2614 return -EINVAL; 2615 2616 /* VM-entry interruption-info field: reserved bits */ 2617 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2618 return -EINVAL; 2619 2620 /* VM-entry instruction length */ 2621 switch (intr_type) { 2622 case INTR_TYPE_SOFT_EXCEPTION: 2623 case INTR_TYPE_SOFT_INTR: 2624 case INTR_TYPE_PRIV_SW_EXCEPTION: 2625 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2626 CC(vmcs12->vm_entry_instruction_len == 0 && 2627 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2628 return -EINVAL; 2629 } 2630 } 2631 2632 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2633 return -EINVAL; 2634 2635 return 0; 2636 } 2637 2638 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2639 struct vmcs12 *vmcs12) 2640 { 2641 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2642 nested_check_vm_exit_controls(vcpu, vmcs12) || 2643 nested_check_vm_entry_controls(vcpu, vmcs12)) 2644 return -EINVAL; 2645 2646 return 0; 2647 } 2648 2649 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2650 struct vmcs12 *vmcs12) 2651 { 2652 bool ia32e; 2653 2654 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2655 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2656 CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3))) 2657 return -EINVAL; 2658 2659 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2660 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2661 return -EINVAL; 2662 2663 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2664 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2665 return -EINVAL; 2666 2667 #ifdef CONFIG_X86_64 2668 ia32e = !!(vcpu->arch.efer & EFER_LMA); 2669 #else 2670 ia32e = false; 2671 #endif 2672 2673 if (ia32e) { 2674 if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || 2675 CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2676 return -EINVAL; 2677 } else { 2678 if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || 2679 CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2680 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2681 CC((vmcs12->host_rip) >> 32)) 2682 return -EINVAL; 2683 } 2684 
2685 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2686 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2687 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2688 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2689 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2690 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2691 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2692 CC(vmcs12->host_cs_selector == 0) || 2693 CC(vmcs12->host_tr_selector == 0) || 2694 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2695 return -EINVAL; 2696 2697 #ifdef CONFIG_X86_64 2698 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2699 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2700 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2701 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2702 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2703 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2704 return -EINVAL; 2705 #endif 2706 2707 /* 2708 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2709 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2710 * the values of the LMA and LME bits in the field must each be that of 2711 * the host address-space size VM-exit control. 2712 */ 2713 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2714 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2715 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2716 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2717 return -EINVAL; 2718 } 2719 2720 return 0; 2721 } 2722 2723 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2724 struct vmcs12 *vmcs12) 2725 { 2726 int r = 0; 2727 struct vmcs12 *shadow; 2728 struct kvm_host_map map; 2729 2730 if (vmcs12->vmcs_link_pointer == -1ull) 2731 return 0; 2732 2733 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2734 return -EINVAL; 2735 2736 if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) 2737 return -EINVAL; 2738 2739 shadow = map.hva; 2740 2741 if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || 2742 CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2743 r = -EINVAL; 2744 2745 kvm_vcpu_unmap(vcpu, &map, false); 2746 return r; 2747 } 2748 2749 /* 2750 * Checks related to Guest Non-register State 2751 */ 2752 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2753 { 2754 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2755 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)) 2756 return -EINVAL; 2757 2758 return 0; 2759 } 2760 2761 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 2762 struct vmcs12 *vmcs12, 2763 u32 *exit_qual) 2764 { 2765 bool ia32e; 2766 2767 *exit_qual = ENTRY_FAIL_DEFAULT; 2768 2769 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 2770 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 2771 return -EINVAL; 2772 2773 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 2774 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 2775 return -EINVAL; 2776 2777 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 2778 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; 2779 return -EINVAL; 2780 } 2781 2782 /* 2783 * If the load IA32_EFER VM-entry control is 1, the following checks 2784 * are performed on the field for the IA32_EFER MSR: 2785 * - Bits reserved in the IA32_EFER MSR 
must be 0.
2786	 *    - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
2787	 *      the IA-32e mode guest VM-exit control. It must also be identical
2788	 *      to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
2789	 *      CR0.PG) is 1.
2790	 */
2791	if (to_vmx(vcpu)->nested.nested_run_pending &&
2792	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
2793		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
2794		if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
2795		    CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
2796		    CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
2797			ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
2798			return -EINVAL;
2799	}
2800
2801	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
2802	    (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
2803	     CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
2804		return -EINVAL;
2805
2806	if (nested_check_guest_non_reg_state(vmcs12))
2807		return -EINVAL;
2808
2809	return 0;
2810	}
2811
2812	static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
2813	{
2814		struct vcpu_vmx *vmx = to_vmx(vcpu);
2815		unsigned long cr3, cr4;
2816		bool vm_fail;
2817
2818		if (!nested_early_check)
2819			return 0;
2820
2821		if (vmx->msr_autoload.host.nr)
2822			vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2823		if (vmx->msr_autoload.guest.nr)
2824			vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2825
2826		preempt_disable();
2827
2828		vmx_prepare_switch_to_guest(vcpu);
2829
2830		/*
2831		 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
2832		 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
2833		 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
2834		 * there is no need to preserve other bits or save/restore the field.
2835		 */
2836		vmcs_writel(GUEST_RFLAGS, 0);
2837
2838		cr3 = __get_current_cr3_fast();
2839		if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
2840			vmcs_writel(HOST_CR3, cr3);
2841			vmx->loaded_vmcs->host_state.cr3 = cr3;
2842		}
2843
2844		cr4 = cr4_read_shadow();
2845		if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
2846			vmcs_writel(HOST_CR4, cr4);
2847			vmx->loaded_vmcs->host_state.cr4 = cr4;
2848		}
2849
2850		asm(
2851			"sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
2852			"cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2853			"je 1f \n\t"
2854			__ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
2855			"mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2856			"1: \n\t"
2857			"add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
2858
2859			/* Check if vmlaunch or vmresume is needed */
2860			"cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
2861
2862			/*
2863			 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
2864			 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
2865			 * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
2866			 * result of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
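		 *
		 * (CC_SET(be)/CC_OUT(be) below therefore latch "CF or ZF set",
		 * i.e. the x86 "below or equal" condition, into vm_fail.)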
2867 */ 2868 "call vmx_vmenter\n\t" 2869 2870 CC_SET(be) 2871 : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) 2872 : [HOST_RSP]"r"((unsigned long)HOST_RSP), 2873 [loaded_vmcs]"r"(vmx->loaded_vmcs), 2874 [launched]"i"(offsetof(struct loaded_vmcs, launched)), 2875 [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), 2876 [wordsize]"i"(sizeof(ulong)) 2877 : "memory" 2878 ); 2879 2880 if (vmx->msr_autoload.host.nr) 2881 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2882 if (vmx->msr_autoload.guest.nr) 2883 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2884 2885 if (vm_fail) { 2886 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 2887 2888 preempt_enable(); 2889 2890 trace_kvm_nested_vmenter_failed( 2891 "early hardware check VM-instruction error: ", error); 2892 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 2893 return 1; 2894 } 2895 2896 /* 2897 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 2898 */ 2899 local_irq_enable(); 2900 if (hw_breakpoint_active()) 2901 set_debugreg(__this_cpu_read(cpu_dr7), 7); 2902 preempt_enable(); 2903 2904 /* 2905 * A non-failing VMEntry means we somehow entered guest mode with 2906 * an illegal RIP, and that's just the tip of the iceberg. There 2907 * is no telling what memory has been modified or what state has 2908 * been exposed to unknown code. Hitting this all but guarantees 2909 * a (very critical) hardware issue. 2910 */ 2911 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 2912 VMX_EXIT_REASONS_FAILED_VMENTRY)); 2913 2914 return 0; 2915 } 2916 2917 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 2918 struct vmcs12 *vmcs12); 2919 2920 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 2921 { 2922 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2923 struct vcpu_vmx *vmx = to_vmx(vcpu); 2924 struct kvm_host_map *map; 2925 struct page *page; 2926 u64 hpa; 2927 2928 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 2929 /* 2930 * Translate L1 physical address to host physical 2931 * address for vmcs02. Keep the page pinned, so this 2932 * physical address remains valid. We keep a reference 2933 * to it so we can release it later. 2934 */ 2935 if (vmx->nested.apic_access_page) { /* shouldn't happen */ 2936 kvm_release_page_dirty(vmx->nested.apic_access_page); 2937 vmx->nested.apic_access_page = NULL; 2938 } 2939 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); 2940 if (!is_error_page(page)) { 2941 vmx->nested.apic_access_page = page; 2942 hpa = page_to_phys(vmx->nested.apic_access_page); 2943 vmcs_write64(APIC_ACCESS_ADDR, hpa); 2944 } else { 2945 pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", 2946 __func__); 2947 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2948 vcpu->run->internal.suberror = 2949 KVM_INTERNAL_ERROR_EMULATION; 2950 vcpu->run->internal.ndata = 0; 2951 return false; 2952 } 2953 } 2954 2955 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 2956 map = &vmx->nested.virtual_apic_map; 2957 2958 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 2959 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 2960 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 2961 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 2962 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 2963 /* 2964 * The processor will never use the TPR shadow, simply 2965 * clear the bit from the execution control. 
Such a 2966 * configuration is useless, but it happens in tests. 2967 * For any other configuration, failing the vm entry is 2968 * _not_ what the processor does but it's basically the 2969 * only possibility we have. 2970 */ 2971 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 2972 } else { 2973 /* 2974 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 2975 * force VM-Entry to fail. 2976 */ 2977 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); 2978 } 2979 } 2980 2981 if (nested_cpu_has_posted_intr(vmcs12)) { 2982 map = &vmx->nested.pi_desc_map; 2983 2984 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 2985 vmx->nested.pi_desc = 2986 (struct pi_desc *)(((void *)map->hva) + 2987 offset_in_page(vmcs12->posted_intr_desc_addr)); 2988 vmcs_write64(POSTED_INTR_DESC_ADDR, 2989 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 2990 } 2991 } 2992 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 2993 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 2994 else 2995 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 2996 return true; 2997 } 2998 2999 /* 3000 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3001 * for running VMX instructions (except VMXON, whose prerequisites are 3002 * slightly different). It also specifies what exception to inject otherwise. 3003 * Note that many of these exceptions have priority over VM exits, so they 3004 * don't have to be checked again here. 3005 */ 3006 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3007 { 3008 if (!to_vmx(vcpu)->nested.vmxon) { 3009 kvm_queue_exception(vcpu, UD_VECTOR); 3010 return 0; 3011 } 3012 3013 if (vmx_get_cpl(vcpu)) { 3014 kvm_inject_gp(vcpu, 0); 3015 return 0; 3016 } 3017 3018 return 1; 3019 } 3020 3021 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3022 { 3023 u8 rvi = vmx_get_rvi(); 3024 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3025 3026 return ((rvi & 0xf0) > (vppr & 0xf0)); 3027 } 3028 3029 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3030 struct vmcs12 *vmcs12); 3031 3032 /* 3033 * If from_vmentry is false, this is being called from state restore (either RSM 3034 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 3035 * 3036 * Returns: 3037 * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode 3038 * NVMX_ENTRY_VMFAIL: Consistency check VMFail 3039 * NVMX_ENTRY_VMEXIT: Consistency check VMExit 3040 * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error 3041 */ 3042 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3043 bool from_vmentry) 3044 { 3045 struct vcpu_vmx *vmx = to_vmx(vcpu); 3046 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3047 bool evaluate_pending_interrupts; 3048 u32 exit_reason = EXIT_REASON_INVALID_STATE; 3049 u32 exit_qual; 3050 3051 evaluate_pending_interrupts = exec_controls_get(vmx) & 3052 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); 3053 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3054 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3055 3056 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3057 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3058 if (kvm_mpx_supported() && 3059 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 3060 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3061 3062 /* 3063 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3064 * nested early checks are disabled. 
In the event of a "late" VM-Fail,
3065	 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3066	 * software model to the pre-VMEntry host state. When EPT is disabled,
3067	 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3068	 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing
3069	 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3070	 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested
3071	 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3072	 * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3073	 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3074	 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3075	 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3076	 * path would need to manually save/restore vmcs01.GUEST_CR3.
3077	 */
3078	if (!enable_ept && !nested_early_check)
3079		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3080
3081	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3082
3083	prepare_vmcs02_early(vmx, vmcs12);
3084
3085	if (from_vmentry) {
3086		if (unlikely(!nested_get_vmcs12_pages(vcpu)))
3087			return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
3088
3089		if (nested_vmx_check_vmentry_hw(vcpu)) {
3090			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3091			return NVMX_VMENTRY_VMFAIL;
3092		}
3093
3094		if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
3095			goto vmentry_fail_vmexit;
3096	}
3097
3098	enter_guest_mode(vcpu);
3099	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3100		vcpu->arch.tsc_offset += vmcs12->tsc_offset;
3101
3102	if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
3103		goto vmentry_fail_vmexit_guest_mode;
3104
3105	if (from_vmentry) {
3106		exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
3107		exit_qual = nested_vmx_load_msr(vcpu,
3108						vmcs12->vm_entry_msr_load_addr,
3109						vmcs12->vm_entry_msr_load_count);
3110		if (exit_qual)
3111			goto vmentry_fail_vmexit_guest_mode;
3112	} else {
3113		/*
3114		 * The MMU is not initialized to point at the right entities yet and
3115		 * "get pages" would need to read data from the guest (i.e. we will
3116		 * need to perform gpa to hpa translation). Request a call
3117		 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs
3118		 * have already been set at vmentry time and should not be reset.
3119		 */
3120		kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
3121	}
3122
3123	/*
3124	 * If L1 had a pending IRQ/NMI when it executed
3125	 * VMLAUNCH/VMRESUME that wasn't delivered because it was
3126	 * disallowed (e.g. interrupts disabled), L0 needs to
3127	 * evaluate whether this pending event should cause an exit from
3128	 * L2 to L1 or be delivered directly to L2 (e.g. in case L1
3129	 * doesn't intercept EXTERNAL_INTERRUPT).
3130	 *
3131	 * Usually this would be handled by the processor noticing an
3132	 * IRQ/NMI window request, or checking RVI during evaluation of
3133	 * pending virtual interrupts. However, this setting was done
3134	 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3135	 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3136	 */
3137	if (unlikely(evaluate_pending_interrupts))
3138		kvm_make_request(KVM_REQ_EVENT, vcpu);
3139
3140	/*
3141	 * Do not start the preemption timer hrtimer until after we know
3142	 * we are successful, so that only nested_vmx_vmexit() needs to cancel
3143	 * the timer.
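	 *
	 * As a rough worked example (assuming vcpu->arch.virtual_tsc_khz ==
	 * 2000000, i.e. a 2 GHz guest TSC), vmx_start_preemption_timer()
	 * converts a vmcs12 timer value of 1000 into
	 * (1000 << VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE) * 1000000 /
	 * 2000000 = 16000 ns for the hrtimer.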
3144 */ 3145 vmx->nested.preemption_timer_expired = false; 3146 if (nested_cpu_has_preemption_timer(vmcs12)) 3147 vmx_start_preemption_timer(vcpu); 3148 3149 /* 3150 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3151 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3152 * returned as far as L1 is concerned. It will only return (and set 3153 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3154 */ 3155 return NVMX_VMENTRY_SUCCESS; 3156 3157 /* 3158 * A failed consistency check that leads to a VMExit during L1's 3159 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3160 * 26.7 "VM-entry failures during or after loading guest state". 3161 */ 3162 vmentry_fail_vmexit_guest_mode: 3163 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 3164 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3165 leave_guest_mode(vcpu); 3166 3167 vmentry_fail_vmexit: 3168 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3169 3170 if (!from_vmentry) 3171 return NVMX_VMENTRY_VMEXIT; 3172 3173 load_vmcs12_host_state(vcpu, vmcs12); 3174 vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 3175 vmcs12->exit_qualification = exit_qual; 3176 if (enable_shadow_vmcs || vmx->nested.hv_evmcs) 3177 vmx->nested.need_vmcs12_to_shadow_sync = true; 3178 return NVMX_VMENTRY_VMEXIT; 3179 } 3180 3181 /* 3182 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3183 * for running an L2 nested guest. 3184 */ 3185 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3186 { 3187 struct vmcs12 *vmcs12; 3188 enum nvmx_vmentry_status status; 3189 struct vcpu_vmx *vmx = to_vmx(vcpu); 3190 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3191 3192 if (!nested_vmx_check_permission(vcpu)) 3193 return 1; 3194 3195 if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch)) 3196 return 1; 3197 3198 if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) 3199 return nested_vmx_failInvalid(vcpu); 3200 3201 vmcs12 = get_vmcs12(vcpu); 3202 3203 /* 3204 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3205 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3206 * rather than RFLAGS.ZF, and no error number is stored to the 3207 * VM-instruction error field. 3208 */ 3209 if (vmcs12->hdr.shadow_vmcs) 3210 return nested_vmx_failInvalid(vcpu); 3211 3212 if (vmx->nested.hv_evmcs) { 3213 copy_enlightened_to_vmcs12(vmx); 3214 /* Enlightened VMCS doesn't have launch state */ 3215 vmcs12->launch_state = !launch; 3216 } else if (enable_shadow_vmcs) { 3217 copy_shadow_to_vmcs12(vmx); 3218 } 3219 3220 /* 3221 * The nested entry process starts with enforcing various prerequisites 3222 * on vmcs12 as required by the Intel SDM, and act appropriately when 3223 * they fail: As the SDM explains, some conditions should cause the 3224 * instruction to fail, while others will cause the instruction to seem 3225 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3226 * To speed up the normal (success) code path, we should avoid checking 3227 * for misconfigurations which will anyway be caught by the processor 3228 * when using the merged vmcs02. 3229 */ 3230 if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) 3231 return nested_vmx_failValid(vcpu, 3232 VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3233 3234 if (vmcs12->launch_state == launch) 3235 return nested_vmx_failValid(vcpu, 3236 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS
3237				: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3238
3239	if (nested_vmx_check_controls(vcpu, vmcs12))
3240		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3241
3242	if (nested_vmx_check_host_state(vcpu, vmcs12))
3243		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3244
3245	/*
3246	 * We're finally done with prerequisite checking, and can start with
3247	 * the nested entry.
3248	 */
3249	vmx->nested.nested_run_pending = 1;
3250	status = nested_vmx_enter_non_root_mode(vcpu, true);
3251	if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3252		goto vmentry_failed;
3253
3254	/* Hide L1D cache contents from the nested guest. */
3255	vmx->vcpu.arch.l1tf_flush_l1d = true;
3256
3257	/*
3258	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3259	 * also be used as part of restoring nVMX state for
3260	 * snapshot restore (migration).
3261	 *
3262	 * In this flow, it is assumed that the vmcs12 cache was
3263	 * transferred as part of the captured nVMX state and should
3264	 * therefore not be read from guest memory (which may not
3265	 * exist on the destination host yet).
3266	 */
3267	nested_cache_shadow_vmcs12(vcpu, vmcs12);
3268
3269	/*
3270	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3271	 * awakened by event injection or by an NMI-window VM-exit or
3272	 * by an interrupt-window VM-exit, halt the vcpu.
3273	 */
3274	if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
3275	    !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3276	    !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
3277	    !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
3278	      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3279		vmx->nested.nested_run_pending = 0;
3280		return kvm_vcpu_halt(vcpu);
3281	}
3282	return 1;
3283
3284	vmentry_failed:
3285	vmx->nested.nested_run_pending = 0;
3286	if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3287		return 0;
3288	if (status == NVMX_VMENTRY_VMEXIT)
3289		return 1;
3290	WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
3291	return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3292	}
3293
3294	/*
3295	 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3296	 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
3297	 * This function returns the new value we should put in vmcs12.guest_cr0.
3298	 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3299	 * 1. Bits that neither L0 nor L1 trapped were set directly by L2 and are now
3300	 *    available in vmcs02 GUEST_CR0. (Note: it's enough to check that L0
3301	 *    didn't trap the bit, because if L1 did, so would L0).
3302	 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3303	 *    been modified by L2, and L1 knows it. So just leave the old value of
3304	 *    the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3305	 *    isn't relevant, because if L0 traps this bit it can set it to anything.
3306	 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3307	 *    changed these bits, and therefore they need to be updated, but L0
3308	 *    didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3309	 *    put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
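	 *
	 * A concrete illustration of case 3: if L0 happened to trap CR0.TS
	 * while L1 did not, the TS value that L2 last wrote lives in vmcs02's
	 * CR0_READ_SHADOW, so it is taken from there rather than from vmcs02
	 * GUEST_CR0 or the stale vmcs12.guest_cr0.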
3310 */ 3311 static inline unsigned long 3312 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3313 { 3314 return 3315 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3316 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3317 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3318 vcpu->arch.cr0_guest_owned_bits)); 3319 } 3320 3321 static inline unsigned long 3322 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3323 { 3324 return 3325 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3326 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3327 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3328 vcpu->arch.cr4_guest_owned_bits)); 3329 } 3330 3331 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3332 struct vmcs12 *vmcs12) 3333 { 3334 u32 idt_vectoring; 3335 unsigned int nr; 3336 3337 if (vcpu->arch.exception.injected) { 3338 nr = vcpu->arch.exception.nr; 3339 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3340 3341 if (kvm_exception_is_soft(nr)) { 3342 vmcs12->vm_exit_instruction_len = 3343 vcpu->arch.event_exit_inst_len; 3344 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3345 } else 3346 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3347 3348 if (vcpu->arch.exception.has_error_code) { 3349 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3350 vmcs12->idt_vectoring_error_code = 3351 vcpu->arch.exception.error_code; 3352 } 3353 3354 vmcs12->idt_vectoring_info_field = idt_vectoring; 3355 } else if (vcpu->arch.nmi_injected) { 3356 vmcs12->idt_vectoring_info_field = 3357 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3358 } else if (vcpu->arch.interrupt.injected) { 3359 nr = vcpu->arch.interrupt.nr; 3360 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3361 3362 if (vcpu->arch.interrupt.soft) { 3363 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3364 vmcs12->vm_entry_instruction_len = 3365 vcpu->arch.event_exit_inst_len; 3366 } else 3367 idt_vectoring |= INTR_TYPE_EXT_INTR; 3368 3369 vmcs12->idt_vectoring_info_field = idt_vectoring; 3370 } 3371 } 3372 3373 3374 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3375 { 3376 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3377 gfn_t gfn; 3378 3379 /* 3380 * Don't need to mark the APIC access page dirty; it is never 3381 * written to by the CPU during APIC virtualization. 
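	 * Only the virtual-APIC page and the posted-interrupt descriptor below
	 * are written by the CPU on L2's behalf, so those are the only pages
	 * that need to be marked dirty here.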
3382 */ 3383 3384 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3385 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3386 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3387 } 3388 3389 if (nested_cpu_has_posted_intr(vmcs12)) { 3390 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3391 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3392 } 3393 } 3394 3395 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3396 { 3397 struct vcpu_vmx *vmx = to_vmx(vcpu); 3398 int max_irr; 3399 void *vapic_page; 3400 u16 status; 3401 3402 if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) 3403 return; 3404 3405 vmx->nested.pi_pending = false; 3406 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3407 return; 3408 3409 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3410 if (max_irr != 256) { 3411 vapic_page = vmx->nested.virtual_apic_map.hva; 3412 if (!vapic_page) 3413 return; 3414 3415 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3416 vapic_page, &max_irr); 3417 status = vmcs_read16(GUEST_INTR_STATUS); 3418 if ((u8)max_irr > ((u8)status & 0xff)) { 3419 status &= ~0xff; 3420 status |= (u8)max_irr; 3421 vmcs_write16(GUEST_INTR_STATUS, status); 3422 } 3423 } 3424 3425 nested_mark_vmcs12_pages_dirty(vcpu); 3426 } 3427 3428 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3429 unsigned long exit_qual) 3430 { 3431 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3432 unsigned int nr = vcpu->arch.exception.nr; 3433 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3434 3435 if (vcpu->arch.exception.has_error_code) { 3436 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3437 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3438 } 3439 3440 if (kvm_exception_is_soft(nr)) 3441 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3442 else 3443 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3444 3445 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3446 vmx_get_nmi_mask(vcpu)) 3447 intr_info |= INTR_INFO_UNBLOCK_NMI; 3448 3449 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3450 } 3451 3452 static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) 3453 { 3454 struct vcpu_vmx *vmx = to_vmx(vcpu); 3455 unsigned long exit_qual; 3456 bool block_nested_events = 3457 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3458 struct kvm_lapic *apic = vcpu->arch.apic; 3459 3460 if (lapic_in_kernel(vcpu) && 3461 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3462 if (block_nested_events) 3463 return -EBUSY; 3464 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3465 return 0; 3466 } 3467 3468 if (vcpu->arch.exception.pending && 3469 nested_vmx_check_exception(vcpu, &exit_qual)) { 3470 if (block_nested_events) 3471 return -EBUSY; 3472 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3473 return 0; 3474 } 3475 3476 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3477 vmx->nested.preemption_timer_expired) { 3478 if (block_nested_events) 3479 return -EBUSY; 3480 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3481 return 0; 3482 } 3483 3484 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { 3485 if (block_nested_events) 3486 return -EBUSY; 3487 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3488 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3489 INTR_INFO_VALID_MASK, 0); 3490 /* 3491 * The NMI-triggered VM exit counts as injection: 3492 * clear this one and block further NMIs. 
3493 */ 3494 vcpu->arch.nmi_pending = 0; 3495 vmx_set_nmi_mask(vcpu, true); 3496 return 0; 3497 } 3498 3499 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && 3500 nested_exit_on_intr(vcpu)) { 3501 if (block_nested_events) 3502 return -EBUSY; 3503 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3504 return 0; 3505 } 3506 3507 vmx_complete_nested_posted_interrupt(vcpu); 3508 return 0; 3509 } 3510 3511 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3512 { 3513 ktime_t remaining = 3514 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 3515 u64 value; 3516 3517 if (ktime_to_ns(remaining) <= 0) 3518 return 0; 3519 3520 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 3521 do_div(value, 1000000); 3522 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 3523 } 3524 3525 static bool is_vmcs12_ext_field(unsigned long field) 3526 { 3527 switch (field) { 3528 case GUEST_ES_SELECTOR: 3529 case GUEST_CS_SELECTOR: 3530 case GUEST_SS_SELECTOR: 3531 case GUEST_DS_SELECTOR: 3532 case GUEST_FS_SELECTOR: 3533 case GUEST_GS_SELECTOR: 3534 case GUEST_LDTR_SELECTOR: 3535 case GUEST_TR_SELECTOR: 3536 case GUEST_ES_LIMIT: 3537 case GUEST_CS_LIMIT: 3538 case GUEST_SS_LIMIT: 3539 case GUEST_DS_LIMIT: 3540 case GUEST_FS_LIMIT: 3541 case GUEST_GS_LIMIT: 3542 case GUEST_LDTR_LIMIT: 3543 case GUEST_TR_LIMIT: 3544 case GUEST_GDTR_LIMIT: 3545 case GUEST_IDTR_LIMIT: 3546 case GUEST_ES_AR_BYTES: 3547 case GUEST_DS_AR_BYTES: 3548 case GUEST_FS_AR_BYTES: 3549 case GUEST_GS_AR_BYTES: 3550 case GUEST_LDTR_AR_BYTES: 3551 case GUEST_TR_AR_BYTES: 3552 case GUEST_ES_BASE: 3553 case GUEST_CS_BASE: 3554 case GUEST_SS_BASE: 3555 case GUEST_DS_BASE: 3556 case GUEST_FS_BASE: 3557 case GUEST_GS_BASE: 3558 case GUEST_LDTR_BASE: 3559 case GUEST_TR_BASE: 3560 case GUEST_GDTR_BASE: 3561 case GUEST_IDTR_BASE: 3562 case GUEST_PENDING_DBG_EXCEPTIONS: 3563 case GUEST_BNDCFGS: 3564 return true; 3565 default: 3566 break; 3567 } 3568 3569 return false; 3570 } 3571 3572 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3573 struct vmcs12 *vmcs12) 3574 { 3575 struct vcpu_vmx *vmx = to_vmx(vcpu); 3576 3577 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 3578 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 3579 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 3580 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 3581 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 3582 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 3583 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 3584 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 3585 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 3586 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 3587 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 3588 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 3589 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 3590 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 3591 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 3592 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 3593 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 3594 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 3595 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 3596 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 3597 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 3598 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 3599 
vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 3600 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 3601 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 3602 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 3603 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 3604 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 3605 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 3606 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 3607 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 3608 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 3609 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 3610 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 3611 vmcs12->guest_pending_dbg_exceptions = 3612 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 3613 if (kvm_mpx_supported()) 3614 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3615 3616 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 3617 } 3618 3619 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3620 struct vmcs12 *vmcs12) 3621 { 3622 struct vcpu_vmx *vmx = to_vmx(vcpu); 3623 int cpu; 3624 3625 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 3626 return; 3627 3628 3629 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 3630 3631 cpu = get_cpu(); 3632 vmx->loaded_vmcs = &vmx->nested.vmcs02; 3633 vmx_vcpu_load(&vmx->vcpu, cpu); 3634 3635 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 3636 3637 vmx->loaded_vmcs = &vmx->vmcs01; 3638 vmx_vcpu_load(&vmx->vcpu, cpu); 3639 put_cpu(); 3640 } 3641 3642 /* 3643 * Update the guest state fields of vmcs12 to reflect changes that 3644 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 3645 * VM-entry controls is also updated, since this is really a guest 3646 * state bit.) 3647 */ 3648 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3649 { 3650 struct vcpu_vmx *vmx = to_vmx(vcpu); 3651 3652 if (vmx->nested.hv_evmcs) 3653 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 3654 3655 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs; 3656 3657 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 3658 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 3659 3660 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 3661 vmcs12->guest_rip = kvm_rip_read(vcpu); 3662 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 3663 3664 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 3665 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 3666 3667 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); 3668 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); 3669 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); 3670 3671 vmcs12->guest_interruptibility_info = 3672 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 3673 3674 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3675 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 3676 else 3677 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 3678 3679 if (nested_cpu_has_preemption_timer(vmcs12) && 3680 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) 3681 vmcs12->vmx_preemption_timer_value = 3682 vmx_get_preemption_timer_value(vcpu); 3683 3684 /* 3685 * In some cases (usually, nested EPT), L2 is allowed to change its 3686 * own CR3 without exiting. If it has changed it, we must keep it. 3687 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 3688 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 3689 * 3690 * Additionally, restore L2's PDPTR to vmcs12. 
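	 * (The PDPTRs are only meaningful when L2 is using PAE paging under
	 * EPT, which is exactly what the nested_cpu_has_ept() &&
	 * is_pae_paging() check below guards.)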
3691 */ 3692 if (enable_ept) { 3693 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 3694 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3695 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 3696 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 3697 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 3698 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 3699 } 3700 } 3701 3702 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 3703 3704 if (nested_cpu_has_vid(vmcs12)) 3705 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 3706 3707 vmcs12->vm_entry_controls = 3708 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 3709 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 3710 3711 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 3712 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 3713 3714 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 3715 vmcs12->guest_ia32_efer = vcpu->arch.efer; 3716 } 3717 3718 /* 3719 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 3720 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 3721 * and this function updates it to reflect the changes to the guest state while 3722 * L2 was running (and perhaps made some exits which were handled directly by L0 3723 * without going back to L1), and to reflect the exit reason. 3724 * Note that we do not have to copy here all VMCS fields, just those that 3725 * could have changed by the L2 guest or the exit - i.e., the guest-state and 3726 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 3727 * which already writes to vmcs12 directly. 3728 */ 3729 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 3730 u32 exit_reason, u32 exit_intr_info, 3731 unsigned long exit_qualification) 3732 { 3733 /* update exit information fields: */ 3734 vmcs12->vm_exit_reason = exit_reason; 3735 vmcs12->exit_qualification = exit_qualification; 3736 vmcs12->vm_exit_intr_info = exit_intr_info; 3737 3738 vmcs12->idt_vectoring_info_field = 0; 3739 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3740 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 3741 3742 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 3743 vmcs12->launch_state = 1; 3744 3745 /* vm_entry_intr_info_field is cleared on exit. Emulate this 3746 * instead of reading the real value. */ 3747 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 3748 3749 /* 3750 * Transfer the event that L0 or L1 may wanted to inject into 3751 * L2 to IDT_VECTORING_INFO_FIELD. 3752 */ 3753 vmcs12_save_pending_event(vcpu, vmcs12); 3754 3755 /* 3756 * According to spec, there's no need to store the guest's 3757 * MSRs if the exit is due to a VM-entry failure that occurs 3758 * during or after loading the guest state. Since this exit 3759 * does not fall in that category, we need to save the MSRs. 3760 */ 3761 if (nested_vmx_store_msr(vcpu, 3762 vmcs12->vm_exit_msr_store_addr, 3763 vmcs12->vm_exit_msr_store_count)) 3764 nested_vmx_abort(vcpu, 3765 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 3766 } 3767 3768 /* 3769 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 3770 * preserved above and would only end up incorrectly in L1. 
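	 * ("preserved above" refers to vmcs12_save_pending_event(), which has
	 * already copied any pending event into
	 * vmcs12->idt_vectoring_info_field.)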
3771          */
3772         vcpu->arch.nmi_injected = false;
3773         kvm_clear_exception_queue(vcpu);
3774         kvm_clear_interrupt_queue(vcpu);
3775 }
3776 
3777 /*
3778  * A part of what we need to do when the nested L2 guest exits and we want to
3779  * run its L1 parent, is to reset L1's guest state to the host state specified
3780  * in vmcs12.
3781  * This function is to be called not only on normal nested exit, but also on
3782  * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
3783  * Failures During or After Loading Guest State").
3784  * This function should be called when the active VMCS is L1's (vmcs01).
3785  */
3786 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3787                                    struct vmcs12 *vmcs12)
3788 {
3789         struct kvm_segment seg;
3790         u32 entry_failure_code;
3791 
3792         if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
3793                 vcpu->arch.efer = vmcs12->host_ia32_efer;
3794         else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3795                 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
3796         else
3797                 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
3798         vmx_set_efer(vcpu, vcpu->arch.efer);
3799 
3800         kvm_rsp_write(vcpu, vmcs12->host_rsp);
3801         kvm_rip_write(vcpu, vmcs12->host_rip);
3802         vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
3803         vmx_set_interrupt_shadow(vcpu, 0);
3804 
3805         /*
3806          * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
3807          * actually changed, because vmx_set_cr0 refers to efer set above.
3808          *
3809          * CR0_GUEST_HOST_MASK is already set in the original vmcs01
3810          * (KVM doesn't change it).
3811          */
3812         vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3813         vmx_set_cr0(vcpu, vmcs12->host_cr0);
3814 
3815         /* Same as above - no reason to call set_cr4_guest_host_mask(). */
3816         vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3817         vmx_set_cr4(vcpu, vmcs12->host_cr4);
3818 
3819         nested_ept_uninit_mmu_context(vcpu);
3820 
3821         /*
3822          * Only PDPTE load can fail as the value of cr3 was checked on entry and
3823          * couldn't have changed.
3824          */
3825         if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
3826                 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
3827 
3828         if (!enable_ept)
3829                 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3830 
3831         /*
3832          * If vmcs01 doesn't use VPID, the CPU flushes the TLB on every
3833          * VMEntry/VMExit. Thus, no need to flush the TLB.
3834          *
3835          * If vmcs12 doesn't use VPID, L1 expects the TLB to be
3836          * flushed on every VMEntry/VMExit.
3837          *
3838          * Otherwise, we can preserve TLB entries as long as we are
3839          * able to tag L1 TLB entries differently than L2 TLB entries.
3840          *
3841          * If vmcs12 uses EPT, we need to execute this flush on EPTP01
3842          * and therefore we request the TLB flush to happen only after VMCS EPTP
3843          * has been set by KVM_REQ_LOAD_CR3.
3844          */
3845         if (enable_vpid &&
3846             (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
3847                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3848         }
3849 
3850         vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
3851         vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
3852         vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
3853         vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
3854         vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
3855         vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
3856         vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
3857 
3858         /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.
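	 * (i.e. GUEST_BNDCFGS simply keeps whatever value L2 last had in it.)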
*/ 3859 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 3860 vmcs_write64(GUEST_BNDCFGS, 0); 3861 3862 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 3863 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 3864 vcpu->arch.pat = vmcs12->host_ia32_pat; 3865 } 3866 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 3867 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 3868 vmcs12->host_ia32_perf_global_ctrl); 3869 3870 /* Set L1 segment info according to Intel SDM 3871 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 3872 seg = (struct kvm_segment) { 3873 .base = 0, 3874 .limit = 0xFFFFFFFF, 3875 .selector = vmcs12->host_cs_selector, 3876 .type = 11, 3877 .present = 1, 3878 .s = 1, 3879 .g = 1 3880 }; 3881 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 3882 seg.l = 1; 3883 else 3884 seg.db = 1; 3885 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 3886 seg = (struct kvm_segment) { 3887 .base = 0, 3888 .limit = 0xFFFFFFFF, 3889 .type = 3, 3890 .present = 1, 3891 .s = 1, 3892 .db = 1, 3893 .g = 1 3894 }; 3895 seg.selector = vmcs12->host_ds_selector; 3896 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 3897 seg.selector = vmcs12->host_es_selector; 3898 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 3899 seg.selector = vmcs12->host_ss_selector; 3900 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 3901 seg.selector = vmcs12->host_fs_selector; 3902 seg.base = vmcs12->host_fs_base; 3903 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 3904 seg.selector = vmcs12->host_gs_selector; 3905 seg.base = vmcs12->host_gs_base; 3906 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 3907 seg = (struct kvm_segment) { 3908 .base = vmcs12->host_tr_base, 3909 .limit = 0x67, 3910 .selector = vmcs12->host_tr_selector, 3911 .type = 11, 3912 .present = 1 3913 }; 3914 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 3915 3916 kvm_set_dr(vcpu, 7, 0x400); 3917 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 3918 3919 if (cpu_has_vmx_msr_bitmap()) 3920 vmx_update_msr_bitmap(vcpu); 3921 3922 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 3923 vmcs12->vm_exit_msr_load_count)) 3924 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 3925 } 3926 3927 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 3928 { 3929 struct shared_msr_entry *efer_msr; 3930 unsigned int i; 3931 3932 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 3933 return vmcs_read64(GUEST_IA32_EFER); 3934 3935 if (cpu_has_load_ia32_efer()) 3936 return host_efer; 3937 3938 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 3939 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 3940 return vmx->msr_autoload.guest.val[i].value; 3941 } 3942 3943 efer_msr = find_msr_entry(vmx, MSR_EFER); 3944 if (efer_msr) 3945 return efer_msr->data; 3946 3947 return host_efer; 3948 } 3949 3950 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 3951 { 3952 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3953 struct vcpu_vmx *vmx = to_vmx(vcpu); 3954 struct vmx_msr_entry g, h; 3955 gpa_t gpa; 3956 u32 i, j; 3957 3958 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 3959 3960 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 3961 /* 3962 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 3963 * as vmcs01.GUEST_DR7 contains a userspace defined value 3964 * and vcpu->arch.dr7 is not squirreled away before the 3965 * nested VMENTER (not worth adding a variable in nested_vmx). 
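		 * Falling back to DR7_FIXED_1 (0x400) disables all hardware
		 * breakpoints, which is the conservative choice when the real
		 * value is unknown.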
3966 */ 3967 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 3968 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 3969 else 3970 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 3971 } 3972 3973 /* 3974 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 3975 * handle a variety of side effects to KVM's software model. 3976 */ 3977 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 3978 3979 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 3980 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 3981 3982 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 3983 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 3984 3985 nested_ept_uninit_mmu_context(vcpu); 3986 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3987 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 3988 3989 /* 3990 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 3991 * from vmcs01 (if necessary). The PDPTRs are not loaded on 3992 * VMFail, like everything else we just need to ensure our 3993 * software model is up-to-date. 3994 */ 3995 if (enable_ept) 3996 ept_save_pdptrs(vcpu); 3997 3998 kvm_mmu_reset_context(vcpu); 3999 4000 if (cpu_has_vmx_msr_bitmap()) 4001 vmx_update_msr_bitmap(vcpu); 4002 4003 /* 4004 * This nasty bit of open coding is a compromise between blindly 4005 * loading L1's MSRs using the exit load lists (incorrect emulation 4006 * of VMFail), leaving the nested VM's MSRs in the software model 4007 * (incorrect behavior) and snapshotting the modified MSRs (too 4008 * expensive since the lists are unbound by hardware). For each 4009 * MSR that was (prematurely) loaded from the nested VMEntry load 4010 * list, reload it from the exit load list if it exists and differs 4011 * from the guest value. The intent is to stuff host state as 4012 * silently as possible, not to fully process the exit load list. 4013 */ 4014 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4015 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4016 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4017 pr_debug_ratelimited( 4018 "%s read MSR index failed (%u, 0x%08llx)\n", 4019 __func__, i, gpa); 4020 goto vmabort; 4021 } 4022 4023 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4024 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4025 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4026 pr_debug_ratelimited( 4027 "%s read MSR failed (%u, 0x%08llx)\n", 4028 __func__, j, gpa); 4029 goto vmabort; 4030 } 4031 if (h.index != g.index) 4032 continue; 4033 if (h.value == g.value) 4034 break; 4035 4036 if (nested_vmx_load_msr_check(vcpu, &h)) { 4037 pr_debug_ratelimited( 4038 "%s check failed (%u, 0x%x, 0x%x)\n", 4039 __func__, j, h.index, h.reserved); 4040 goto vmabort; 4041 } 4042 4043 if (kvm_set_msr(vcpu, h.index, h.value)) { 4044 pr_debug_ratelimited( 4045 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4046 __func__, j, h.index, h.value); 4047 goto vmabort; 4048 } 4049 } 4050 } 4051 4052 return; 4053 4054 vmabort: 4055 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4056 } 4057 4058 /* 4059 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4060 * and modify vmcs12 to make it see what it would expect to see there if 4061 * L2 was its real guest. 
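 * (A caller may pass exit_reason == -1 to request a "synthetic" exit; in
 * that case the exit-information fields of vmcs12 are left untouched, see
 * the exit_reason != -1 checks below.)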
Must only be called when in L2 (is_guest_mode()) 4062 */ 4063 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, 4064 u32 exit_intr_info, unsigned long exit_qualification) 4065 { 4066 struct vcpu_vmx *vmx = to_vmx(vcpu); 4067 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4068 4069 /* trying to cancel vmlaunch/vmresume is a bug */ 4070 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4071 4072 leave_guest_mode(vcpu); 4073 4074 if (nested_cpu_has_preemption_timer(vmcs12)) 4075 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4076 4077 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 4078 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 4079 4080 if (likely(!vmx->fail)) { 4081 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4082 4083 if (exit_reason != -1) 4084 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 4085 exit_qualification); 4086 4087 /* 4088 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4089 * also be used to capture vmcs12 cache as part of 4090 * capturing nVMX state for snapshot (migration). 4091 * 4092 * Otherwise, this flush will dirty guest memory at a 4093 * point it is already assumed by user-space to be 4094 * immutable. 4095 */ 4096 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4097 } else { 4098 /* 4099 * The only expected VM-instruction error is "VM entry with 4100 * invalid control field(s)." Anything else indicates a 4101 * problem with L0. And we should never get here with a 4102 * VMFail of any type if early consistency checks are enabled. 4103 */ 4104 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4105 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4106 WARN_ON_ONCE(nested_early_check); 4107 } 4108 4109 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4110 4111 /* Update any VMCS fields that might have changed while L2 ran */ 4112 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4113 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4114 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4115 4116 if (kvm_has_tsc_control) 4117 decache_tsc_multiplier(vmx); 4118 4119 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4120 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4121 vmx_set_virtual_apic_mode(vcpu); 4122 } else if (!nested_cpu_has_ept(vmcs12) && 4123 nested_cpu_has2(vmcs12, 4124 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 4125 vmx_flush_tlb(vcpu, true); 4126 } 4127 4128 /* Unpin physical memory we referred to in vmcs02 */ 4129 if (vmx->nested.apic_access_page) { 4130 kvm_release_page_dirty(vmx->nested.apic_access_page); 4131 vmx->nested.apic_access_page = NULL; 4132 } 4133 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4134 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4135 vmx->nested.pi_desc = NULL; 4136 4137 /* 4138 * We are now running in L2, mmu_notifier will force to reload the 4139 * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. 4140 */ 4141 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4142 4143 if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) 4144 vmx->nested.need_vmcs12_to_shadow_sync = true; 4145 4146 /* in case we halted in L2 */ 4147 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4148 4149 if (likely(!vmx->fail)) { 4150 /* 4151 * TODO: SDM says that with acknowledge interrupt on 4152 * exit, bit 31 of the VM-exit interrupt information 4153 * (valid interrupt) is always set to 1 on 4154 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't 4155 * need kvm_cpu_has_interrupt(). See the commit 4156 * message for details. 
4157 */ 4158 if (nested_exit_intr_ack_set(vcpu) && 4159 exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4160 kvm_cpu_has_interrupt(vcpu)) { 4161 int irq = kvm_cpu_get_interrupt(vcpu); 4162 WARN_ON(irq < 0); 4163 vmcs12->vm_exit_intr_info = irq | 4164 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4165 } 4166 4167 if (exit_reason != -1) 4168 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4169 vmcs12->exit_qualification, 4170 vmcs12->idt_vectoring_info_field, 4171 vmcs12->vm_exit_intr_info, 4172 vmcs12->vm_exit_intr_error_code, 4173 KVM_ISA_VMX); 4174 4175 load_vmcs12_host_state(vcpu, vmcs12); 4176 4177 return; 4178 } 4179 4180 /* 4181 * After an early L2 VM-entry failure, we're now back 4182 * in L1 which thinks it just finished a VMLAUNCH or 4183 * VMRESUME instruction, so we need to set the failure 4184 * flag and the VM-instruction error field of the VMCS 4185 * accordingly, and skip the emulated instruction. 4186 */ 4187 (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4188 4189 /* 4190 * Restore L1's host state to KVM's software model. We're here 4191 * because a consistency check was caught by hardware, which 4192 * means some amount of guest state has been propagated to KVM's 4193 * model and needs to be unwound to the host's state. 4194 */ 4195 nested_vmx_restore_host_state(vcpu); 4196 4197 vmx->fail = 0; 4198 } 4199 4200 /* 4201 * Decode the memory-address operand of a vmx instruction, as recorded on an 4202 * exit caused by such an instruction (run by a guest hypervisor). 4203 * On success, returns 0. When the operand is invalid, returns 1 and throws 4204 * #UD or #GP. 4205 */ 4206 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4207 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4208 { 4209 gva_t off; 4210 bool exn; 4211 struct kvm_segment s; 4212 4213 /* 4214 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4215 * Execution", on an exit, vmx_instruction_info holds most of the 4216 * addressing components of the operand. Only the displacement part 4217 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4218 * For how an actual address is calculated from all these components, 4219 * refer to Vol. 1, "Operand Addressing". 4220 */ 4221 int scaling = vmx_instruction_info & 3; 4222 int addr_size = (vmx_instruction_info >> 7) & 7; 4223 bool is_reg = vmx_instruction_info & (1u << 10); 4224 int seg_reg = (vmx_instruction_info >> 15) & 7; 4225 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4226 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4227 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4228 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4229 4230 if (is_reg) { 4231 kvm_queue_exception(vcpu, UD_VECTOR); 4232 return 1; 4233 } 4234 4235 /* Addr = segment_base + offset */ 4236 /* offset = base + [index * scale] + displacement */ 4237 off = exit_qualification; /* holds the displacement */ 4238 if (addr_size == 1) 4239 off = (gva_t)sign_extend64(off, 31); 4240 else if (addr_size == 0) 4241 off = (gva_t)sign_extend64(off, 15); 4242 if (base_is_valid) 4243 off += kvm_register_read(vcpu, base_reg); 4244 if (index_is_valid) 4245 off += kvm_register_read(vcpu, index_reg)<<scaling; 4246 vmx_get_segment(vcpu, &s, seg_reg); 4247 4248 /* 4249 * The effective address, i.e. @off, of a memory operand is truncated 4250 * based on the address size of the instruction. Note that this is 4251 * the *effective address*, i.e. 
the address prior to accounting for 4252 * the segment's base. 4253 */ 4254 if (addr_size == 1) /* 32 bit */ 4255 off &= 0xffffffff; 4256 else if (addr_size == 0) /* 16 bit */ 4257 off &= 0xffff; 4258 4259 /* Checks for #GP/#SS exceptions. */ 4260 exn = false; 4261 if (is_long_mode(vcpu)) { 4262 /* 4263 * The virtual/linear address is never truncated in 64-bit 4264 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4265 * address when using FS/GS with a non-zero base. 4266 */ 4267 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4268 *ret = s.base + off; 4269 else 4270 *ret = off; 4271 4272 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4273 * non-canonical form. This is the only check on the memory 4274 * destination for long mode! 4275 */ 4276 exn = is_noncanonical_address(*ret, vcpu); 4277 } else { 4278 /* 4279 * When not in long mode, the virtual/linear address is 4280 * unconditionally truncated to 32 bits regardless of the 4281 * address size. 4282 */ 4283 *ret = (s.base + off) & 0xffffffff; 4284 4285 /* Protected mode: apply checks for segment validity in the 4286 * following order: 4287 * - segment type check (#GP(0) may be thrown) 4288 * - usability check (#GP(0)/#SS(0)) 4289 * - limit check (#GP(0)/#SS(0)) 4290 */ 4291 if (wr) 4292 /* #GP(0) if the destination operand is located in a 4293 * read-only data segment or any code segment. 4294 */ 4295 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4296 else 4297 /* #GP(0) if the source operand is located in an 4298 * execute-only code segment 4299 */ 4300 exn = ((s.type & 0xa) == 8); 4301 if (exn) { 4302 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4303 return 1; 4304 } 4305 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4306 */ 4307 exn = (s.unusable != 0); 4308 4309 /* 4310 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4311 * outside the segment limit. All CPUs that support VMX ignore 4312 * limit checks for flat segments, i.e. segments with base==0, 4313 * limit==0xffffffff and of type expand-up data or code. 4314 */ 4315 if (!(s.base == 0 && s.limit == 0xffffffff && 4316 ((s.type & 8) || !(s.type & 4)))) 4317 exn = exn || ((u64)off + len - 1 > s.limit); 4318 } 4319 if (exn) { 4320 kvm_queue_exception_e(vcpu, 4321 seg_reg == VCPU_SREG_SS ? 4322 SS_VECTOR : GP_VECTOR, 4323 0); 4324 return 1; 4325 } 4326 4327 return 0; 4328 } 4329 4330 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) 4331 { 4332 gva_t gva; 4333 struct x86_exception e; 4334 4335 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 4336 vmcs_read32(VMX_INSTRUCTION_INFO), false, 4337 sizeof(*vmpointer), &gva)) 4338 return 1; 4339 4340 if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { 4341 kvm_inject_page_fault(vcpu, &e); 4342 return 1; 4343 } 4344 4345 return 0; 4346 } 4347 4348 /* 4349 * Allocate a shadow VMCS and associate it with the currently loaded 4350 * VMCS, unless such a shadow VMCS already exists. The newly allocated 4351 * VMCS is also VMCLEARed, so that it is ready for use. 4352 */ 4353 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 4354 { 4355 struct vcpu_vmx *vmx = to_vmx(vcpu); 4356 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 4357 4358 /* 4359 * We should allocate a shadow vmcs for vmcs01 only when L1 4360 * executes VMXON and free it when L1 executes VMXOFF. 4361 * As it is invalid to execute VMXON twice, we shouldn't reach 4362 * here when vmcs01 already have an allocated shadow vmcs. 
4363 */ 4364 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); 4365 4366 if (!loaded_vmcs->shadow_vmcs) { 4367 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 4368 if (loaded_vmcs->shadow_vmcs) 4369 vmcs_clear(loaded_vmcs->shadow_vmcs); 4370 } 4371 return loaded_vmcs->shadow_vmcs; 4372 } 4373 4374 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 4375 { 4376 struct vcpu_vmx *vmx = to_vmx(vcpu); 4377 int r; 4378 4379 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 4380 if (r < 0) 4381 goto out_vmcs02; 4382 4383 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4384 if (!vmx->nested.cached_vmcs12) 4385 goto out_cached_vmcs12; 4386 4387 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4388 if (!vmx->nested.cached_shadow_vmcs12) 4389 goto out_cached_shadow_vmcs12; 4390 4391 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 4392 goto out_shadow_vmcs; 4393 4394 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 4395 HRTIMER_MODE_REL_PINNED); 4396 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 4397 4398 vmx->nested.vpid02 = allocate_vpid(); 4399 4400 vmx->nested.vmcs02_initialized = false; 4401 vmx->nested.vmxon = true; 4402 4403 if (pt_mode == PT_MODE_HOST_GUEST) { 4404 vmx->pt_desc.guest.ctl = 0; 4405 pt_update_intercept_for_msr(vmx); 4406 } 4407 4408 return 0; 4409 4410 out_shadow_vmcs: 4411 kfree(vmx->nested.cached_shadow_vmcs12); 4412 4413 out_cached_shadow_vmcs12: 4414 kfree(vmx->nested.cached_vmcs12); 4415 4416 out_cached_vmcs12: 4417 free_loaded_vmcs(&vmx->nested.vmcs02); 4418 4419 out_vmcs02: 4420 return -ENOMEM; 4421 } 4422 4423 /* 4424 * Emulate the VMXON instruction. 4425 * Currently, we just remember that VMX is active, and do not save or even 4426 * inspect the argument to VMXON (the so-called "VMXON pointer") because we 4427 * do not currently need to store anything in that guest-allocated memory 4428 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their 4429 * argument is different from the VMXON pointer (which the spec says they do). 4430 */ 4431 static int handle_vmon(struct kvm_vcpu *vcpu) 4432 { 4433 int ret; 4434 gpa_t vmptr; 4435 uint32_t revision; 4436 struct vcpu_vmx *vmx = to_vmx(vcpu); 4437 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED 4438 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 4439 4440 /* 4441 * The Intel VMX Instruction Reference lists a bunch of bits that are 4442 * prerequisite to running VMXON, most notably cr4.VMXE must be set to 4443 * 1 (see vmx_set_cr4() for when we allow the guest to set this). 4444 * Otherwise, we should fail with #UD. But most faulting conditions 4445 * have already been checked by hardware, prior to the VM-exit for 4446 * VMXON. We do test guest cr4.VMXE because processor CR4 always has 4447 * that bit set to 1 in non-root mode. 4448 */ 4449 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { 4450 kvm_queue_exception(vcpu, UD_VECTOR); 4451 return 1; 4452 } 4453 4454 /* CPL=0 must be checked manually. 
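	 * The VM-exit taken from L1 for VMXON happens before the CPU would
	 * have performed the CPL check, so this is the one condition hardware
	 * has not already validated for us.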
*/ 4455 if (vmx_get_cpl(vcpu)) { 4456 kvm_inject_gp(vcpu, 0); 4457 return 1; 4458 } 4459 4460 if (vmx->nested.vmxon) 4461 return nested_vmx_failValid(vcpu, 4462 VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 4463 4464 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 4465 != VMXON_NEEDED_FEATURES) { 4466 kvm_inject_gp(vcpu, 0); 4467 return 1; 4468 } 4469 4470 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4471 return 1; 4472 4473 /* 4474 * SDM 3: 24.11.5 4475 * The first 4 bytes of VMXON region contain the supported 4476 * VMCS revision identifier 4477 * 4478 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 4479 * which replaces physical address width with 32 4480 */ 4481 if (!page_address_valid(vcpu, vmptr)) 4482 return nested_vmx_failInvalid(vcpu); 4483 4484 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 4485 revision != VMCS12_REVISION) 4486 return nested_vmx_failInvalid(vcpu); 4487 4488 vmx->nested.vmxon_ptr = vmptr; 4489 ret = enter_vmx_operation(vcpu); 4490 if (ret) 4491 return ret; 4492 4493 return nested_vmx_succeed(vcpu); 4494 } 4495 4496 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 4497 { 4498 struct vcpu_vmx *vmx = to_vmx(vcpu); 4499 4500 if (vmx->nested.current_vmptr == -1ull) 4501 return; 4502 4503 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 4504 4505 if (enable_shadow_vmcs) { 4506 /* copy to memory all shadowed fields in case 4507 they were modified */ 4508 copy_shadow_to_vmcs12(vmx); 4509 vmx_disable_shadow_vmcs(vmx); 4510 } 4511 vmx->nested.posted_intr_nv = -1; 4512 4513 /* Flush VMCS12 to guest memory */ 4514 kvm_vcpu_write_guest_page(vcpu, 4515 vmx->nested.current_vmptr >> PAGE_SHIFT, 4516 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 4517 4518 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 4519 4520 vmx->nested.current_vmptr = -1ull; 4521 } 4522 4523 /* Emulate the VMXOFF instruction */ 4524 static int handle_vmoff(struct kvm_vcpu *vcpu) 4525 { 4526 if (!nested_vmx_check_permission(vcpu)) 4527 return 1; 4528 4529 free_nested(vcpu); 4530 4531 /* Process a latched INIT during time CPU was in VMX operation */ 4532 kvm_make_request(KVM_REQ_EVENT, vcpu); 4533 4534 return nested_vmx_succeed(vcpu); 4535 } 4536 4537 /* Emulate the VMCLEAR instruction */ 4538 static int handle_vmclear(struct kvm_vcpu *vcpu) 4539 { 4540 struct vcpu_vmx *vmx = to_vmx(vcpu); 4541 u32 zero = 0; 4542 gpa_t vmptr; 4543 u64 evmcs_gpa; 4544 4545 if (!nested_vmx_check_permission(vcpu)) 4546 return 1; 4547 4548 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4549 return 1; 4550 4551 if (!page_address_valid(vcpu, vmptr)) 4552 return nested_vmx_failValid(vcpu, 4553 VMXERR_VMCLEAR_INVALID_ADDRESS); 4554 4555 if (vmptr == vmx->nested.vmxon_ptr) 4556 return nested_vmx_failValid(vcpu, 4557 VMXERR_VMCLEAR_VMXON_POINTER); 4558 4559 /* 4560 * When Enlightened VMEntry is enabled on the calling CPU we treat 4561 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 4562 * way to distinguish it from VMCS12) and we must not corrupt it by 4563 * writing to the non-existent 'launch_state' field. The area doesn't 4564 * have to be the currently active EVMCS on the calling CPU and there's 4565 * nothing KVM has to do to transition it from 'active' to 'non-active' 4566 * state. It is possible that the area will stay mapped as 4567 * vmx->nested.hv_evmcs but this shouldn't be a problem. 
4568 */ 4569 if (likely(!vmx->nested.enlightened_vmcs_enabled || 4570 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 4571 if (vmptr == vmx->nested.current_vmptr) 4572 nested_release_vmcs12(vcpu); 4573 4574 kvm_vcpu_write_guest(vcpu, 4575 vmptr + offsetof(struct vmcs12, 4576 launch_state), 4577 &zero, sizeof(zero)); 4578 } 4579 4580 return nested_vmx_succeed(vcpu); 4581 } 4582 4583 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); 4584 4585 /* Emulate the VMLAUNCH instruction */ 4586 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 4587 { 4588 return nested_vmx_run(vcpu, true); 4589 } 4590 4591 /* Emulate the VMRESUME instruction */ 4592 static int handle_vmresume(struct kvm_vcpu *vcpu) 4593 { 4594 4595 return nested_vmx_run(vcpu, false); 4596 } 4597 4598 static int handle_vmread(struct kvm_vcpu *vcpu) 4599 { 4600 unsigned long field; 4601 u64 field_value; 4602 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4603 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4604 int len; 4605 gva_t gva = 0; 4606 struct vmcs12 *vmcs12; 4607 struct x86_exception e; 4608 short offset; 4609 4610 if (!nested_vmx_check_permission(vcpu)) 4611 return 1; 4612 4613 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) 4614 return nested_vmx_failInvalid(vcpu); 4615 4616 if (!is_guest_mode(vcpu)) 4617 vmcs12 = get_vmcs12(vcpu); 4618 else { 4619 /* 4620 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD 4621 * to shadowed-field sets the ALU flags for VMfailInvalid. 4622 */ 4623 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) 4624 return nested_vmx_failInvalid(vcpu); 4625 vmcs12 = get_shadow_vmcs12(vcpu); 4626 } 4627 4628 /* Decode instruction info and find the field to read */ 4629 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 4630 4631 offset = vmcs_field_to_offset(field); 4632 if (offset < 0) 4633 return nested_vmx_failValid(vcpu, 4634 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4635 4636 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 4637 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4638 4639 /* Read the field, zero-extended to a u64 field_value */ 4640 field_value = vmcs12_read_any(vmcs12, field, offset); 4641 4642 /* 4643 * Now copy part of this value to register or memory, as requested. 4644 * Note that the number of bits actually copied is 32 or 64 depending 4645 * on the guest's mode (32 or 64 bit), not on the given field's length. 4646 */ 4647 if (vmx_instruction_info & (1u << 10)) { 4648 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), 4649 field_value); 4650 } else { 4651 len = is_64_bit_mode(vcpu) ? 
8 : 4; 4652 if (get_vmx_mem_address(vcpu, exit_qualification, 4653 vmx_instruction_info, true, len, &gva)) 4654 return 1; 4655 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 4656 if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e)) 4657 kvm_inject_page_fault(vcpu, &e); 4658 } 4659 4660 return nested_vmx_succeed(vcpu); 4661 } 4662 4663 static bool is_shadow_field_rw(unsigned long field) 4664 { 4665 switch (field) { 4666 #define SHADOW_FIELD_RW(x, y) case x: 4667 #include "vmcs_shadow_fields.h" 4668 return true; 4669 default: 4670 break; 4671 } 4672 return false; 4673 } 4674 4675 static bool is_shadow_field_ro(unsigned long field) 4676 { 4677 switch (field) { 4678 #define SHADOW_FIELD_RO(x, y) case x: 4679 #include "vmcs_shadow_fields.h" 4680 return true; 4681 default: 4682 break; 4683 } 4684 return false; 4685 } 4686 4687 static int handle_vmwrite(struct kvm_vcpu *vcpu) 4688 { 4689 unsigned long field; 4690 int len; 4691 gva_t gva; 4692 struct vcpu_vmx *vmx = to_vmx(vcpu); 4693 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4694 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4695 4696 /* The value to write might be 32 or 64 bits, depending on L1's long 4697 * mode, and eventually we need to write that into a field of several 4698 * possible lengths. The code below first zero-extends the value to 64 4699 * bit (field_value), and then copies only the appropriate number of 4700 * bits into the vmcs12 field. 4701 */ 4702 u64 field_value = 0; 4703 struct x86_exception e; 4704 struct vmcs12 *vmcs12; 4705 short offset; 4706 4707 if (!nested_vmx_check_permission(vcpu)) 4708 return 1; 4709 4710 if (vmx->nested.current_vmptr == -1ull) 4711 return nested_vmx_failInvalid(vcpu); 4712 4713 if (vmx_instruction_info & (1u << 10)) 4714 field_value = kvm_register_readl(vcpu, 4715 (((vmx_instruction_info) >> 3) & 0xf)); 4716 else { 4717 len = is_64_bit_mode(vcpu) ? 8 : 4; 4718 if (get_vmx_mem_address(vcpu, exit_qualification, 4719 vmx_instruction_info, false, len, &gva)) 4720 return 1; 4721 if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) { 4722 kvm_inject_page_fault(vcpu, &e); 4723 return 1; 4724 } 4725 } 4726 4727 4728 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 4729 /* 4730 * If the vCPU supports "VMWRITE to any supported field in the 4731 * VMCS," then the "read-only" fields are actually read/write. 4732 */ 4733 if (vmcs_field_readonly(field) && 4734 !nested_cpu_has_vmwrite_any_field(vcpu)) 4735 return nested_vmx_failValid(vcpu, 4736 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 4737 4738 if (!is_guest_mode(vcpu)) { 4739 vmcs12 = get_vmcs12(vcpu); 4740 4741 /* 4742 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 4743 * vmcs12, else we may crush a field or consume a stale value. 4744 */ 4745 if (!is_shadow_field_rw(field)) 4746 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4747 } else { 4748 /* 4749 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE 4750 * to shadowed-field sets the ALU flags for VMfailInvalid. 4751 */ 4752 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) 4753 return nested_vmx_failInvalid(vcpu); 4754 vmcs12 = get_shadow_vmcs12(vcpu); 4755 } 4756 4757 offset = vmcs_field_to_offset(field); 4758 if (offset < 0) 4759 return nested_vmx_failValid(vcpu, 4760 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4761 4762 /* 4763 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 4764 * fields on VMWRITE. 
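	 * (The 0x1f0ff mask used below keeps the architecturally defined bits,
	 * i.e. type, S, DPL, P, AVL, L, D/B, G and the "unusable" bit, and
	 * clears the reserved bits.)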
Emulate this behavior to ensure consistent KVM 4765 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 4766 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 4767 * from L1 will return a different value than VMREAD from L2 (L1 sees 4768 * the stripped down value, L2 sees the full value as stored by KVM). 4769 */ 4770 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 4771 field_value &= 0x1f0ff; 4772 4773 vmcs12_write_any(vmcs12, field, offset, field_value); 4774 4775 /* 4776 * Do not track vmcs12 dirty-state if in guest-mode as we actually 4777 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 4778 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 4779 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 4780 */ 4781 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 4782 /* 4783 * L1 can read these fields without exiting, ensure the 4784 * shadow VMCS is up-to-date. 4785 */ 4786 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 4787 preempt_disable(); 4788 vmcs_load(vmx->vmcs01.shadow_vmcs); 4789 4790 __vmcs_writel(field, field_value); 4791 4792 vmcs_clear(vmx->vmcs01.shadow_vmcs); 4793 vmcs_load(vmx->loaded_vmcs->vmcs); 4794 preempt_enable(); 4795 } 4796 vmx->nested.dirty_vmcs12 = true; 4797 } 4798 4799 return nested_vmx_succeed(vcpu); 4800 } 4801 4802 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 4803 { 4804 vmx->nested.current_vmptr = vmptr; 4805 if (enable_shadow_vmcs) { 4806 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 4807 vmcs_write64(VMCS_LINK_POINTER, 4808 __pa(vmx->vmcs01.shadow_vmcs)); 4809 vmx->nested.need_vmcs12_to_shadow_sync = true; 4810 } 4811 vmx->nested.dirty_vmcs12 = true; 4812 } 4813 4814 /* Emulate the VMPTRLD instruction */ 4815 static int handle_vmptrld(struct kvm_vcpu *vcpu) 4816 { 4817 struct vcpu_vmx *vmx = to_vmx(vcpu); 4818 gpa_t vmptr; 4819 4820 if (!nested_vmx_check_permission(vcpu)) 4821 return 1; 4822 4823 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4824 return 1; 4825 4826 if (!page_address_valid(vcpu, vmptr)) 4827 return nested_vmx_failValid(vcpu, 4828 VMXERR_VMPTRLD_INVALID_ADDRESS); 4829 4830 if (vmptr == vmx->nested.vmxon_ptr) 4831 return nested_vmx_failValid(vcpu, 4832 VMXERR_VMPTRLD_VMXON_POINTER); 4833 4834 /* Forbid normal VMPTRLD if Enlightened version was used */ 4835 if (vmx->nested.hv_evmcs) 4836 return 1; 4837 4838 if (vmx->nested.current_vmptr != vmptr) { 4839 struct kvm_host_map map; 4840 struct vmcs12 *new_vmcs12; 4841 4842 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { 4843 /* 4844 * Reads from an unbacked page return all 1s, 4845 * which means that the 32 bits located at the 4846 * given physical address won't match the required 4847 * VMCS12_REVISION identifier. 4848 */ 4849 return nested_vmx_failValid(vcpu, 4850 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 4851 } 4852 4853 new_vmcs12 = map.hva; 4854 4855 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || 4856 (new_vmcs12->hdr.shadow_vmcs && 4857 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 4858 kvm_vcpu_unmap(vcpu, &map, false); 4859 return nested_vmx_failValid(vcpu, 4860 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 4861 } 4862 4863 nested_release_vmcs12(vcpu); 4864 4865 /* 4866 * Load VMCS12 from guest memory since it is not already 4867 * cached. 
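		 * Subsequent VMREAD/VMWRITE emulation operates on this
		 * in-kernel copy; it is written back to guest memory when the
		 * VMCS is released, see nested_release_vmcs12().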
4868 */ 4869 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); 4870 kvm_vcpu_unmap(vcpu, &map, false); 4871 4872 set_current_vmptr(vmx, vmptr); 4873 } 4874 4875 return nested_vmx_succeed(vcpu); 4876 } 4877 4878 /* Emulate the VMPTRST instruction */ 4879 static int handle_vmptrst(struct kvm_vcpu *vcpu) 4880 { 4881 unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); 4882 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4883 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 4884 struct x86_exception e; 4885 gva_t gva; 4886 4887 if (!nested_vmx_check_permission(vcpu)) 4888 return 1; 4889 4890 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) 4891 return 1; 4892 4893 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 4894 true, sizeof(gpa_t), &gva)) 4895 return 1; 4896 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 4897 if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 4898 sizeof(gpa_t), &e)) { 4899 kvm_inject_page_fault(vcpu, &e); 4900 return 1; 4901 } 4902 return nested_vmx_succeed(vcpu); 4903 } 4904 4905 /* Emulate the INVEPT instruction */ 4906 static int handle_invept(struct kvm_vcpu *vcpu) 4907 { 4908 struct vcpu_vmx *vmx = to_vmx(vcpu); 4909 u32 vmx_instruction_info, types; 4910 unsigned long type; 4911 gva_t gva; 4912 struct x86_exception e; 4913 struct { 4914 u64 eptp, gpa; 4915 } operand; 4916 4917 if (!(vmx->nested.msrs.secondary_ctls_high & 4918 SECONDARY_EXEC_ENABLE_EPT) || 4919 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 4920 kvm_queue_exception(vcpu, UD_VECTOR); 4921 return 1; 4922 } 4923 4924 if (!nested_vmx_check_permission(vcpu)) 4925 return 1; 4926 4927 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4928 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 4929 4930 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 4931 4932 if (type >= 32 || !(types & (1 << type))) 4933 return nested_vmx_failValid(vcpu, 4934 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 4935 4936 /* According to the Intel VMX instruction reference, the memory 4937 * operand is read even if it isn't needed (e.g., for type==global) 4938 */ 4939 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 4940 vmx_instruction_info, false, sizeof(operand), &gva)) 4941 return 1; 4942 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { 4943 kvm_inject_page_fault(vcpu, &e); 4944 return 1; 4945 } 4946 4947 switch (type) { 4948 case VMX_EPT_EXTENT_GLOBAL: 4949 case VMX_EPT_EXTENT_CONTEXT: 4950 /* 4951 * TODO: Sync the necessary shadow EPT roots here, rather than 4952 * at the next emulated VM-entry. 
4953 */ 4954 break; 4955 default: 4956 BUG_ON(1); 4957 break; 4958 } 4959 4960 return nested_vmx_succeed(vcpu); 4961 } 4962 4963 static int handle_invvpid(struct kvm_vcpu *vcpu) 4964 { 4965 struct vcpu_vmx *vmx = to_vmx(vcpu); 4966 u32 vmx_instruction_info; 4967 unsigned long type, types; 4968 gva_t gva; 4969 struct x86_exception e; 4970 struct { 4971 u64 vpid; 4972 u64 gla; 4973 } operand; 4974 u16 vpid02; 4975 4976 if (!(vmx->nested.msrs.secondary_ctls_high & 4977 SECONDARY_EXEC_ENABLE_VPID) || 4978 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 4979 kvm_queue_exception(vcpu, UD_VECTOR); 4980 return 1; 4981 } 4982 4983 if (!nested_vmx_check_permission(vcpu)) 4984 return 1; 4985 4986 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4987 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 4988 4989 types = (vmx->nested.msrs.vpid_caps & 4990 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 4991 4992 if (type >= 32 || !(types & (1 << type))) 4993 return nested_vmx_failValid(vcpu, 4994 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 4995 4996 /* according to the intel vmx instruction reference, the memory 4997 * operand is read even if it isn't needed (e.g., for type==global) 4998 */ 4999 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 5000 vmx_instruction_info, false, sizeof(operand), &gva)) 5001 return 1; 5002 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { 5003 kvm_inject_page_fault(vcpu, &e); 5004 return 1; 5005 } 5006 if (operand.vpid >> 16) 5007 return nested_vmx_failValid(vcpu, 5008 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5009 5010 vpid02 = nested_get_vpid02(vcpu); 5011 switch (type) { 5012 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5013 if (!operand.vpid || 5014 is_noncanonical_address(operand.gla, vcpu)) 5015 return nested_vmx_failValid(vcpu, 5016 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5017 if (cpu_has_vmx_invvpid_individual_addr()) { 5018 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, 5019 vpid02, operand.gla); 5020 } else 5021 __vmx_flush_tlb(vcpu, vpid02, false); 5022 break; 5023 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5024 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5025 if (!operand.vpid) 5026 return nested_vmx_failValid(vcpu, 5027 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5028 __vmx_flush_tlb(vcpu, vpid02, false); 5029 break; 5030 case VMX_VPID_EXTENT_ALL_CONTEXT: 5031 __vmx_flush_tlb(vcpu, vpid02, false); 5032 break; 5033 default: 5034 WARN_ON_ONCE(1); 5035 return kvm_skip_emulated_instruction(vcpu); 5036 } 5037 5038 return nested_vmx_succeed(vcpu); 5039 } 5040 5041 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5042 struct vmcs12 *vmcs12) 5043 { 5044 u32 index = kvm_rcx_read(vcpu); 5045 u64 address; 5046 bool accessed_dirty; 5047 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 5048 5049 if (!nested_cpu_has_eptp_switching(vmcs12) || 5050 !nested_cpu_has_ept(vmcs12)) 5051 return 1; 5052 5053 if (index >= VMFUNC_EPTP_ENTRIES) 5054 return 1; 5055 5056 5057 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5058 &address, index * 8, 8)) 5059 return 1; 5060 5061 accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); 5062 5063 /* 5064 * If the (L2) guest does a vmfunc to the currently 5065 * active ept pointer, we don't have to do anything else 5066 */ 5067 if (vmcs12->ept_pointer != address) { 5068 if (!valid_ept_address(vcpu, address)) 5069 return 1; 5070 5071 kvm_mmu_unload(vcpu); 5072 mmu->ept_ad = accessed_dirty; 5073 mmu->mmu_role.base.ad_disabled = !accessed_dirty; 5074 vmcs12->ept_pointer = 
address; 5075 /* 5076 * TODO: Check what's the correct approach in case 5077 * mmu reload fails. Currently, we just let the next 5078 * reload potentially fail 5079 */ 5080 kvm_mmu_reload(vcpu); 5081 } 5082 5083 return 0; 5084 } 5085 5086 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5087 { 5088 struct vcpu_vmx *vmx = to_vmx(vcpu); 5089 struct vmcs12 *vmcs12; 5090 u32 function = kvm_rax_read(vcpu); 5091 5092 /* 5093 * VMFUNC is only supported for nested guests, but we always enable the 5094 * secondary control for simplicity; for non-nested mode, fake that we 5095 * didn't enable it by injecting #UD. 5096 */ 5097 if (!is_guest_mode(vcpu)) { 5098 kvm_queue_exception(vcpu, UD_VECTOR); 5099 return 1; 5100 } 5101 5102 vmcs12 = get_vmcs12(vcpu); 5103 if ((vmcs12->vm_function_control & (1 << function)) == 0) 5104 goto fail; 5105 5106 switch (function) { 5107 case 0: 5108 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5109 goto fail; 5110 break; 5111 default: 5112 goto fail; 5113 } 5114 return kvm_skip_emulated_instruction(vcpu); 5115 5116 fail: 5117 nested_vmx_vmexit(vcpu, vmx->exit_reason, 5118 vmcs_read32(VM_EXIT_INTR_INFO), 5119 vmcs_readl(EXIT_QUALIFICATION)); 5120 return 1; 5121 } 5122 5123 5124 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5125 struct vmcs12 *vmcs12) 5126 { 5127 unsigned long exit_qualification; 5128 gpa_t bitmap, last_bitmap; 5129 unsigned int port; 5130 int size; 5131 u8 b; 5132 5133 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5134 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 5135 5136 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5137 5138 port = exit_qualification >> 16; 5139 size = (exit_qualification & 7) + 1; 5140 5141 last_bitmap = (gpa_t)-1; 5142 b = -1; 5143 5144 while (size > 0) { 5145 if (port < 0x8000) 5146 bitmap = vmcs12->io_bitmap_a; 5147 else if (port < 0x10000) 5148 bitmap = vmcs12->io_bitmap_b; 5149 else 5150 return true; 5151 bitmap += (port & 0x7fff) / 8; 5152 5153 if (last_bitmap != bitmap) 5154 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5155 return true; 5156 if (b & (1 << (port & 7))) 5157 return true; 5158 5159 port++; 5160 size--; 5161 last_bitmap = bitmap; 5162 } 5163 5164 return false; 5165 } 5166 5167 /* 5168 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 5169 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 5170 * disinterest in the current event (read or write a specific MSR) by using an 5171 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 5172 */ 5173 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 5174 struct vmcs12 *vmcs12, u32 exit_reason) 5175 { 5176 u32 msr_index = kvm_rcx_read(vcpu); 5177 gpa_t bitmap; 5178 5179 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 5180 return true; 5181 5182 /* 5183 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 5184 * for the four combinations of read/write and low/high MSR numbers.
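 * An illustrative sketch of the layout assumed by the code below (this is
 * the standard VMX MSR-bitmap layout, paraphrased rather than quoted from
 * the SDM):
 *
 *   offset    0..1023: read bitmap,  low MSRs  (0x00000000 - 0x00001fff)
 *   offset 1024..2047: read bitmap,  high MSRs (0xc0000000 - 0xc0001fff)
 *   offset 2048..3071: write bitmap, low MSRs
 *   offset 3072..4095: write bitmap, high MSRs
 *
 * so a write access adds 2048 and a high MSR adds 1024, exactly as done
 * below.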
5185 * First we need to figure out which of the four to use: 5186 */ 5187 bitmap = vmcs12->msr_bitmap; 5188 if (exit_reason == EXIT_REASON_MSR_WRITE) 5189 bitmap += 2048; 5190 if (msr_index >= 0xc0000000) { 5191 msr_index -= 0xc0000000; 5192 bitmap += 1024; 5193 } 5194 5195 /* Then read the msr_index'th bit from this bitmap: */ 5196 if (msr_index < 1024*8) { 5197 unsigned char b; 5198 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 5199 return true; 5200 return 1 & (b >> (msr_index & 7)); 5201 } else 5202 return true; /* let L1 handle the wrong parameter */ 5203 } 5204 5205 /* 5206 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 5207 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 5208 * intercept (via guest_host_mask etc.) the current event. 5209 */ 5210 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 5211 struct vmcs12 *vmcs12) 5212 { 5213 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5214 int cr = exit_qualification & 15; 5215 int reg; 5216 unsigned long val; 5217 5218 switch ((exit_qualification >> 4) & 3) { 5219 case 0: /* mov to cr */ 5220 reg = (exit_qualification >> 8) & 15; 5221 val = kvm_register_readl(vcpu, reg); 5222 switch (cr) { 5223 case 0: 5224 if (vmcs12->cr0_guest_host_mask & 5225 (val ^ vmcs12->cr0_read_shadow)) 5226 return true; 5227 break; 5228 case 3: 5229 if ((vmcs12->cr3_target_count >= 1 && 5230 vmcs12->cr3_target_value0 == val) || 5231 (vmcs12->cr3_target_count >= 2 && 5232 vmcs12->cr3_target_value1 == val) || 5233 (vmcs12->cr3_target_count >= 3 && 5234 vmcs12->cr3_target_value2 == val) || 5235 (vmcs12->cr3_target_count >= 4 && 5236 vmcs12->cr3_target_value3 == val)) 5237 return false; 5238 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 5239 return true; 5240 break; 5241 case 4: 5242 if (vmcs12->cr4_guest_host_mask & 5243 (vmcs12->cr4_read_shadow ^ val)) 5244 return true; 5245 break; 5246 case 8: 5247 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 5248 return true; 5249 break; 5250 } 5251 break; 5252 case 2: /* clts */ 5253 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 5254 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 5255 return true; 5256 break; 5257 case 1: /* mov from cr */ 5258 switch (cr) { 5259 case 3: 5260 if (vmcs12->cpu_based_vm_exec_control & 5261 CPU_BASED_CR3_STORE_EXITING) 5262 return true; 5263 break; 5264 case 8: 5265 if (vmcs12->cpu_based_vm_exec_control & 5266 CPU_BASED_CR8_STORE_EXITING) 5267 return true; 5268 break; 5269 } 5270 break; 5271 case 3: /* lmsw */ 5272 /* 5273 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 5274 * cr0. Other attempted changes are ignored, with no exit. 
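 *
 * Illustrative example of the checks below (not from the original source):
 * if L1 owns CR0.TS (bit 3 of cr0_guest_host_mask is set) and the lmsw
 * source value's TS bit differs from cr0_read_shadow's TS bit, the access
 * is reflected to L1; likewise, setting PE while L1 owns it and the shadow
 * has PE clear is reflected.  Changes to bits L1 does not own are handled
 * by L0 without a nested exit.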
5275 */ 5276 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5277 if (vmcs12->cr0_guest_host_mask & 0xe & 5278 (val ^ vmcs12->cr0_read_shadow)) 5279 return true; 5280 if ((vmcs12->cr0_guest_host_mask & 0x1) && 5281 !(vmcs12->cr0_read_shadow & 0x1) && 5282 (val & 0x1)) 5283 return true; 5284 break; 5285 } 5286 return false; 5287 } 5288 5289 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 5290 struct vmcs12 *vmcs12, gpa_t bitmap) 5291 { 5292 u32 vmx_instruction_info; 5293 unsigned long field; 5294 u8 b; 5295 5296 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 5297 return true; 5298 5299 /* Decode instruction info and find the field to access */ 5300 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5301 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 5302 5303 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 5304 if (field >> 15) 5305 return true; 5306 5307 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 5308 return true; 5309 5310 return 1 & (b >> (field & 7)); 5311 } 5312 5313 /* 5314 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we 5315 * should handle it ourselves in L0 (and then continue L2). Only call this 5316 * when in is_guest_mode (L2). 5317 */ 5318 bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) 5319 { 5320 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 5321 struct vcpu_vmx *vmx = to_vmx(vcpu); 5322 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5323 5324 if (vmx->nested.nested_run_pending) 5325 return false; 5326 5327 if (unlikely(vmx->fail)) { 5328 trace_kvm_nested_vmenter_failed( 5329 "hardware VM-instruction error: ", 5330 vmcs_read32(VM_INSTRUCTION_ERROR)); 5331 return true; 5332 } 5333 5334 /* 5335 * The host physical addresses of some pages of guest memory 5336 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC 5337 * Page). The CPU may write to these pages via their host 5338 * physical address while L2 is running, bypassing any 5339 * address-translation-based dirty tracking (e.g. EPT write 5340 * protection). 5341 * 5342 * Mark them dirty on every exit from L2 to prevent them from 5343 * getting out of sync with dirty tracking. 
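 *
 * (Explanatory note, not part of the original comment: if these pages were
 * not marked dirty here, a dirty-log consumer such as live migration could
 * transfer a stale copy of, e.g., the virtual-APIC page or the
 * posted-interrupt descriptor, since the CPU updates them without going
 * through the EPT paging structures.)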
5344 */ 5345 nested_mark_vmcs12_pages_dirty(vcpu); 5346 5347 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, 5348 vmcs_readl(EXIT_QUALIFICATION), 5349 vmx->idt_vectoring_info, 5350 intr_info, 5351 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 5352 KVM_ISA_VMX); 5353 5354 switch (exit_reason) { 5355 case EXIT_REASON_EXCEPTION_NMI: 5356 if (is_nmi(intr_info)) 5357 return false; 5358 else if (is_page_fault(intr_info)) 5359 return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; 5360 else if (is_debug(intr_info) && 5361 vcpu->guest_debug & 5362 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5363 return false; 5364 else if (is_breakpoint(intr_info) && 5365 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5366 return false; 5367 return vmcs12->exception_bitmap & 5368 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 5369 case EXIT_REASON_EXTERNAL_INTERRUPT: 5370 return false; 5371 case EXIT_REASON_TRIPLE_FAULT: 5372 return true; 5373 case EXIT_REASON_PENDING_INTERRUPT: 5374 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); 5375 case EXIT_REASON_NMI_WINDOW: 5376 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); 5377 case EXIT_REASON_TASK_SWITCH: 5378 return true; 5379 case EXIT_REASON_CPUID: 5380 return true; 5381 case EXIT_REASON_HLT: 5382 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 5383 case EXIT_REASON_INVD: 5384 return true; 5385 case EXIT_REASON_INVLPG: 5386 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5387 case EXIT_REASON_RDPMC: 5388 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 5389 case EXIT_REASON_RDRAND: 5390 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 5391 case EXIT_REASON_RDSEED: 5392 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 5393 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 5394 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 5395 case EXIT_REASON_VMREAD: 5396 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5397 vmcs12->vmread_bitmap); 5398 case EXIT_REASON_VMWRITE: 5399 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5400 vmcs12->vmwrite_bitmap); 5401 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 5402 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 5403 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 5404 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 5405 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 5406 /* 5407 * VMX instructions trap unconditionally. This allows L1 to 5408 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
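 *
 * (Explanatory note: when L2 is itself a hypervisor and executes, say,
 * VMLAUNCH, the exit always reaches L0 first; returning true here makes
 * L0 reflect it to L1, which then emulates the instruction for its own
 * nested guest.)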
5409 */ 5410 return true; 5411 case EXIT_REASON_CR_ACCESS: 5412 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 5413 case EXIT_REASON_DR_ACCESS: 5414 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 5415 case EXIT_REASON_IO_INSTRUCTION: 5416 return nested_vmx_exit_handled_io(vcpu, vmcs12); 5417 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 5418 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 5419 case EXIT_REASON_MSR_READ: 5420 case EXIT_REASON_MSR_WRITE: 5421 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 5422 case EXIT_REASON_INVALID_STATE: 5423 return true; 5424 case EXIT_REASON_MWAIT_INSTRUCTION: 5425 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 5426 case EXIT_REASON_MONITOR_TRAP_FLAG: 5427 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); 5428 case EXIT_REASON_MONITOR_INSTRUCTION: 5429 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 5430 case EXIT_REASON_PAUSE_INSTRUCTION: 5431 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 5432 nested_cpu_has2(vmcs12, 5433 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 5434 case EXIT_REASON_MCE_DURING_VMENTRY: 5435 return false; 5436 case EXIT_REASON_TPR_BELOW_THRESHOLD: 5437 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 5438 case EXIT_REASON_APIC_ACCESS: 5439 case EXIT_REASON_APIC_WRITE: 5440 case EXIT_REASON_EOI_INDUCED: 5441 /* 5442 * The controls for "virtualize APIC accesses," "APIC- 5443 * register virtualization," and "virtual-interrupt 5444 * delivery" only come from vmcs12. 5445 */ 5446 return true; 5447 case EXIT_REASON_EPT_VIOLATION: 5448 /* 5449 * L0 always deals with the EPT violation. If nested EPT is 5450 * used, and the nested mmu code discovers that the address is 5451 * missing in the guest EPT table (EPT12), the EPT violation 5452 * will be injected with nested_ept_inject_page_fault() 5453 */ 5454 return false; 5455 case EXIT_REASON_EPT_MISCONFIG: 5456 /* 5457 * L2 never directly uses L1's EPT, but rather L0's own EPT 5458 * table (shadow on EPT) or a merged EPT table that L0 built 5459 * (EPT on EPT). So any problems with the structure of the 5460 * table are L0's fault. 5461 */ 5462 return false; 5463 case EXIT_REASON_INVPCID: 5464 return 5465 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 5466 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5467 case EXIT_REASON_WBINVD: 5468 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 5469 case EXIT_REASON_XSETBV: 5470 return true; 5471 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 5472 /* 5473 * This should never happen, since it is not possible to 5474 * set XSS to a non-zero value---neither in L1 nor in L2. 5475 * If it were, XSS would have to be checked against 5476 * the XSS exit bitmap in vmcs12. 5477 */ 5478 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 5479 case EXIT_REASON_PREEMPTION_TIMER: 5480 return false; 5481 case EXIT_REASON_PML_FULL: 5482 /* We emulate PML support for L1. */ 5483 return false; 5484 case EXIT_REASON_VMFUNC: 5485 /* VM functions are emulated through L2->L0 vmexits.
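 * (i.e. L0 handles EXIT_REASON_VMFUNC itself in handle_vmfunc() above and
 * only synthesizes a nested VM exit to L1 when the requested function is
 * not enabled in vmcs12 or fails -- explanatory note.)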
*/ 5486 return false; 5487 case EXIT_REASON_ENCLS: 5488 /* SGX is never exposed to L1 */ 5489 return false; 5490 case EXIT_REASON_UMWAIT: 5491 case EXIT_REASON_TPAUSE: 5492 return nested_cpu_has2(vmcs12, 5493 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 5494 default: 5495 return true; 5496 } 5497 } 5498 5499 5500 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 5501 struct kvm_nested_state __user *user_kvm_nested_state, 5502 u32 user_data_size) 5503 { 5504 struct vcpu_vmx *vmx; 5505 struct vmcs12 *vmcs12; 5506 struct kvm_nested_state kvm_state = { 5507 .flags = 0, 5508 .format = KVM_STATE_NESTED_FORMAT_VMX, 5509 .size = sizeof(kvm_state), 5510 .hdr.vmx.vmxon_pa = -1ull, 5511 .hdr.vmx.vmcs12_pa = -1ull, 5512 }; 5513 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 5514 &user_kvm_nested_state->data.vmx[0]; 5515 5516 if (!vcpu) 5517 return kvm_state.size + sizeof(*user_vmx_nested_state); 5518 5519 vmx = to_vmx(vcpu); 5520 vmcs12 = get_vmcs12(vcpu); 5521 5522 if (nested_vmx_allowed(vcpu) && 5523 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 5524 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 5525 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 5526 5527 if (vmx_has_valid_vmcs12(vcpu)) { 5528 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 5529 5530 if (vmx->nested.hv_evmcs) 5531 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 5532 5533 if (is_guest_mode(vcpu) && 5534 nested_cpu_has_shadow_vmcs(vmcs12) && 5535 vmcs12->vmcs_link_pointer != -1ull) 5536 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 5537 } 5538 5539 if (vmx->nested.smm.vmxon) 5540 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 5541 5542 if (vmx->nested.smm.guest_mode) 5543 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 5544 5545 if (is_guest_mode(vcpu)) { 5546 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 5547 5548 if (vmx->nested.nested_run_pending) 5549 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 5550 } 5551 } 5552 5553 if (user_data_size < kvm_state.size) 5554 goto out; 5555 5556 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 5557 return -EFAULT; 5558 5559 if (!vmx_has_valid_vmcs12(vcpu)) 5560 goto out; 5561 5562 /* 5563 * When running L2, the authoritative vmcs12 state is in the 5564 * vmcs02. When running L1, the authoritative vmcs12 state is 5565 * in the shadow or enlightened vmcs linked to vmcs01, unless 5566 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 5567 * vmcs12 state is in the vmcs12 already. 5568 */ 5569 if (is_guest_mode(vcpu)) { 5570 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 5571 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5572 } else if (!vmx->nested.need_vmcs12_to_shadow_sync) { 5573 if (vmx->nested.hv_evmcs) 5574 copy_enlightened_to_vmcs12(vmx); 5575 else if (enable_shadow_vmcs) 5576 copy_shadow_to_vmcs12(vmx); 5577 } 5578 5579 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 5580 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 5581 5582 /* 5583 * Copy over the full allocated size of vmcs12 rather than just the size 5584 * of the struct. 
of the struct. */ 5586 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 5587 return -EFAULT; 5588 5589 if (nested_cpu_has_shadow_vmcs(vmcs12) && 5590 vmcs12->vmcs_link_pointer != -1ull) { 5591 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 5592 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 5593 return -EFAULT; 5594 } 5595 5596 out: 5597 return kvm_state.size; 5598 } 5599 5600 /* 5601 * Forcibly leave nested mode in order to be able to reset the VCPU later on. 5602 */ 5603 void vmx_leave_nested(struct kvm_vcpu *vcpu) 5604 { 5605 if (is_guest_mode(vcpu)) { 5606 to_vmx(vcpu)->nested.nested_run_pending = 0; 5607 nested_vmx_vmexit(vcpu, -1, 0, 0); 5608 } 5609 free_nested(vcpu); 5610 } 5611 5612 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 5613 struct kvm_nested_state __user *user_kvm_nested_state, 5614 struct kvm_nested_state *kvm_state) 5615 { 5616 struct vcpu_vmx *vmx = to_vmx(vcpu); 5617 struct vmcs12 *vmcs12; 5618 u32 exit_qual; 5619 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 5620 &user_kvm_nested_state->data.vmx[0]; 5621 int ret; 5622 5623 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 5624 return -EINVAL; 5625 5626 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) { 5627 if (kvm_state->hdr.vmx.smm.flags) 5628 return -EINVAL; 5629 5630 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) 5631 return -EINVAL; 5632 5633 /* 5634 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 5635 * enable the eVMCS capability on the vCPU. However, since then 5636 * the code was changed such that the flag signals that vmcs12 should 5637 * be copied into the eVMCS in guest memory. 5638 * 5639 * To preserve backwards compatibility, allow userspace 5640 * to set this flag even when there is no VMXON region. 5641 */ 5642 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 5643 return -EINVAL; 5644 } else { 5645 if (!nested_vmx_allowed(vcpu)) 5646 return -EINVAL; 5647 5648 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 5649 return -EINVAL; 5650 } 5651 5652 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 5653 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 5654 return -EINVAL; 5655 5656 if (kvm_state->hdr.vmx.smm.flags & 5657 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 5658 return -EINVAL; 5659 5660 /* 5661 * SMM temporarily disables VMX, so we cannot be in guest mode, 5662 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 5663 * must be zero. 5664 */ 5665 if (is_smm(vcpu) ?
5666 (kvm_state->flags & 5667 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 5668 : kvm_state->hdr.vmx.smm.flags) 5669 return -EINVAL; 5670 5671 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 5672 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 5673 return -EINVAL; 5674 5675 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 5676 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) 5677 return -EINVAL; 5678 5679 vmx_leave_nested(vcpu); 5680 5681 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) 5682 return 0; 5683 5684 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 5685 ret = enter_vmx_operation(vcpu); 5686 if (ret) 5687 return ret; 5688 5689 /* Empty 'VMXON' state is permitted */ 5690 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) 5691 return 0; 5692 5693 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { 5694 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 5695 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 5696 return -EINVAL; 5697 5698 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 5699 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 5700 /* 5701 * Sync eVMCS upon entry as we may not have 5702 * HV_X64_MSR_VP_ASSIST_PAGE set up yet. 5703 */ 5704 vmx->nested.need_vmcs12_to_shadow_sync = true; 5705 } else { 5706 return -EINVAL; 5707 } 5708 5709 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 5710 vmx->nested.smm.vmxon = true; 5711 vmx->nested.vmxon = false; 5712 5713 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 5714 vmx->nested.smm.guest_mode = true; 5715 } 5716 5717 vmcs12 = get_vmcs12(vcpu); 5718 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 5719 return -EFAULT; 5720 5721 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 5722 return -EINVAL; 5723 5724 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 5725 return 0; 5726 5727 vmx->nested.nested_run_pending = 5728 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 5729 5730 ret = -EINVAL; 5731 if (nested_cpu_has_shadow_vmcs(vmcs12) && 5732 vmcs12->vmcs_link_pointer != -1ull) { 5733 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 5734 5735 if (kvm_state->size < 5736 sizeof(*kvm_state) + 5737 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 5738 goto error_guest_mode; 5739 5740 if (copy_from_user(shadow_vmcs12, 5741 user_vmx_nested_state->shadow_vmcs12, 5742 sizeof(*shadow_vmcs12))) { 5743 ret = -EFAULT; 5744 goto error_guest_mode; 5745 } 5746 5747 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 5748 !shadow_vmcs12->hdr.shadow_vmcs) 5749 goto error_guest_mode; 5750 } 5751 5752 if (nested_vmx_check_controls(vcpu, vmcs12) || 5753 nested_vmx_check_host_state(vcpu, vmcs12) || 5754 nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) 5755 goto error_guest_mode; 5756 5757 vmx->nested.dirty_vmcs12 = true; 5758 ret = nested_vmx_enter_non_root_mode(vcpu, false); 5759 if (ret) 5760 goto error_guest_mode; 5761 5762 return 0; 5763 5764 error_guest_mode: 5765 vmx->nested.nested_run_pending = 0; 5766 return ret; 5767 } 5768 5769 void nested_vmx_vcpu_setup(void) 5770 { 5771 if (enable_shadow_vmcs) { 5772 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 5773 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 5774 } 5775 } 5776 5777 /* 5778 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 5779 * returned for the various VMX controls MSRs when nested VMX is enabled. 
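 * (Worked example with illustrative numbers, not taken from real hardware:
 * a pinbased_ctls value of 0x0000003f00000016 would mean that bits 1, 2
 * and 4 of the pin-based controls must be 1, while only bits 0-5 may be 1
 * at all; the low/high halves are described below.)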
5780 * The same values should also be used to verify that vmcs12 control fields are 5781 * valid during nested entry from L1 to L2. 5782 * Each of these control MSRs has a low and high 32-bit half: A low bit is on 5783 * if the corresponding bit in the (32-bit) control field *must* be on, and a 5784 * bit in the high half is on if the corresponding bit in the control field 5785 * may be on. See also vmx_control_verify(). 5786 */ 5787 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, 5788 bool apicv) 5789 { 5790 /* 5791 * Note that as a general rule, the high half of the MSRs (bits in 5792 * the control fields which may be 1) should be initialized by the 5793 * intersection of the underlying hardware's MSR (i.e., features which 5794 * can be supported) and the list of features we want to expose - 5795 * because they are known to be properly supported in our code. 5796 * Also, usually, the low half of the MSRs (bits which must be 1) can 5797 * be set to 0, meaning that L1 may turn off any of these bits. The 5798 * reason is that if one of these bits is necessary, it will appear 5799 * in vmcs01, and prepare_vmcs02, which bitwise-ORs the control 5800 * fields of vmcs01 and vmcs12, will keep these bits on in vmcs02 - and 5801 * nested_vmx_exit_reflected() will not pass the related exits to L1. 5802 * These rules have exceptions below. 5803 */ 5804 5805 /* pin-based controls */ 5806 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 5807 msrs->pinbased_ctls_low, 5808 msrs->pinbased_ctls_high); 5809 msrs->pinbased_ctls_low |= 5810 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 5811 msrs->pinbased_ctls_high &= 5812 PIN_BASED_EXT_INTR_MASK | 5813 PIN_BASED_NMI_EXITING | 5814 PIN_BASED_VIRTUAL_NMIS | 5815 (apicv ? PIN_BASED_POSTED_INTR : 0); 5816 msrs->pinbased_ctls_high |= 5817 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 5818 PIN_BASED_VMX_PREEMPTION_TIMER; 5819 5820 /* exit controls */ 5821 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 5822 msrs->exit_ctls_low, 5823 msrs->exit_ctls_high); 5824 msrs->exit_ctls_low = 5825 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 5826 5827 msrs->exit_ctls_high &= 5828 #ifdef CONFIG_X86_64 5829 VM_EXIT_HOST_ADDR_SPACE_SIZE | 5830 #endif 5831 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 5832 msrs->exit_ctls_high |= 5833 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 5834 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 5835 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 5836 5837 /* We support free control of debug control saving. */ 5838 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 5839 5840 /* entry controls */ 5841 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 5842 msrs->entry_ctls_low, 5843 msrs->entry_ctls_high); 5844 msrs->entry_ctls_low = 5845 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 5846 msrs->entry_ctls_high &= 5847 #ifdef CONFIG_X86_64 5848 VM_ENTRY_IA32E_MODE | 5849 #endif 5850 VM_ENTRY_LOAD_IA32_PAT; 5851 msrs->entry_ctls_high |= 5852 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 5853 5854 /* We support free control of debug control loading.
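 * ("Free control" here means the bit is cleared from the "must be 1" low
 * half below, so L1 may set VM_ENTRY_LOAD_DEBUG_CONTROLS to either value
 * -- explanatory note.)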
*/ 5855 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 5856 5857 /* cpu-based controls */ 5858 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 5859 msrs->procbased_ctls_low, 5860 msrs->procbased_ctls_high); 5861 msrs->procbased_ctls_low = 5862 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 5863 msrs->procbased_ctls_high &= 5864 CPU_BASED_VIRTUAL_INTR_PENDING | 5865 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | 5866 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 5867 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 5868 CPU_BASED_CR3_STORE_EXITING | 5869 #ifdef CONFIG_X86_64 5870 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 5871 #endif 5872 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 5873 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 5874 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 5875 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 5876 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 5877 /* 5878 * We can allow some features even when not supported by the 5879 * hardware. For example, L1 can specify an MSR bitmap - and we 5880 * can use it to avoid exits to L1 - even when L0 runs L2 5881 * without MSR bitmaps. 5882 */ 5883 msrs->procbased_ctls_high |= 5884 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 5885 CPU_BASED_USE_MSR_BITMAPS; 5886 5887 /* We support free control of CR3 access interception. */ 5888 msrs->procbased_ctls_low &= 5889 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 5890 5891 /* 5892 * secondary cpu-based controls. Do not include those that 5893 * depend on CPUID bits, they are added later by vmx_cpuid_update. 5894 */ 5895 if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) 5896 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 5897 msrs->secondary_ctls_low, 5898 msrs->secondary_ctls_high); 5899 5900 msrs->secondary_ctls_low = 0; 5901 msrs->secondary_ctls_high &= 5902 SECONDARY_EXEC_DESC | 5903 SECONDARY_EXEC_RDTSCP | 5904 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 5905 SECONDARY_EXEC_WBINVD_EXITING | 5906 SECONDARY_EXEC_APIC_REGISTER_VIRT | 5907 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 5908 SECONDARY_EXEC_RDRAND_EXITING | 5909 SECONDARY_EXEC_ENABLE_INVPCID | 5910 SECONDARY_EXEC_RDSEED_EXITING | 5911 SECONDARY_EXEC_XSAVES; 5912 5913 /* 5914 * We can emulate "VMCS shadowing," even if the hardware 5915 * doesn't support it. 
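 * (How: L2's VMREAD/VMWRITE always exit to L0, which consults vmcs12's
 * vmread/vmwrite bitmaps in nested_vmx_exit_handled_vmcs_access() to
 * decide whether to reflect the exit to L1, so no hardware shadow-VMCS
 * support is required -- explanatory note.)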
5916 */ 5917 msrs->secondary_ctls_high |= 5918 SECONDARY_EXEC_SHADOW_VMCS; 5919 5920 if (enable_ept) { 5921 /* nested EPT: emulate EPT also to L1 */ 5922 msrs->secondary_ctls_high |= 5923 SECONDARY_EXEC_ENABLE_EPT; 5924 msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | 5925 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; 5926 if (cpu_has_vmx_ept_execute_only()) 5927 msrs->ept_caps |= 5928 VMX_EPT_EXECUTE_ONLY_BIT; 5929 msrs->ept_caps &= ept_caps; 5930 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 5931 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 5932 VMX_EPT_1GB_PAGE_BIT; 5933 if (enable_ept_ad_bits) { 5934 msrs->secondary_ctls_high |= 5935 SECONDARY_EXEC_ENABLE_PML; 5936 msrs->ept_caps |= VMX_EPT_AD_BIT; 5937 } 5938 } 5939 5940 if (cpu_has_vmx_vmfunc()) { 5941 msrs->secondary_ctls_high |= 5942 SECONDARY_EXEC_ENABLE_VMFUNC; 5943 /* 5944 * Advertise EPTP switching unconditionally 5945 * since we emulate it 5946 */ 5947 if (enable_ept) 5948 msrs->vmfunc_controls = 5949 VMX_VMFUNC_EPTP_SWITCHING; 5950 } 5951 5952 /* 5953 * Old versions of KVM use the single-context version without 5954 * checking for support, so declare that it is supported even 5955 * though it is treated as global context. The alternative is 5956 * not failing the single-context invvpid, and it is worse. 5957 */ 5958 if (enable_vpid) { 5959 msrs->secondary_ctls_high |= 5960 SECONDARY_EXEC_ENABLE_VPID; 5961 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 5962 VMX_VPID_EXTENT_SUPPORTED_MASK; 5963 } 5964 5965 if (enable_unrestricted_guest) 5966 msrs->secondary_ctls_high |= 5967 SECONDARY_EXEC_UNRESTRICTED_GUEST; 5968 5969 if (flexpriority_enabled) 5970 msrs->secondary_ctls_high |= 5971 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 5972 5973 /* miscellaneous data */ 5974 rdmsr(MSR_IA32_VMX_MISC, 5975 msrs->misc_low, 5976 msrs->misc_high); 5977 msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA; 5978 msrs->misc_low |= 5979 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 5980 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 5981 VMX_MISC_ACTIVITY_HLT; 5982 msrs->misc_high = 0; 5983 5984 /* 5985 * This MSR reports some information about VMX support. We 5986 * should return information about the VMX we emulate for the 5987 * guest, and the VMCS structure we give it - not about the 5988 * VMX support of the underlying hardware. 5989 */ 5990 msrs->basic = 5991 VMCS12_REVISION | 5992 VMX_BASIC_TRUE_CTLS | 5993 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 5994 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 5995 5996 if (cpu_has_vmx_basic_inout()) 5997 msrs->basic |= VMX_BASIC_INOUT; 5998 5999 /* 6000 * These MSRs specify bits which the guest must keep fixed on 6001 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 6002 * We picked the standard core2 setting. 6003 */ 6004 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 6005 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 6006 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 6007 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 6008 6009 /* These MSRs specify bits which the guest must keep fixed off. 
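 * (More precisely: a guest CR0/CR4 bit may only be set if the corresponding
 * bit in these FIXED1 MSRs is 1, i.e. the zero bits are the ones that must
 * stay off -- explanatory note.)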
*/ 6010 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 6011 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 6012 6013 /* highest index: VMX_PREEMPTION_TIMER_VALUE */ 6014 msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; 6015 } 6016 6017 void nested_vmx_hardware_unsetup(void) 6018 { 6019 int i; 6020 6021 if (enable_shadow_vmcs) { 6022 for (i = 0; i < VMX_BITMAP_NR; i++) 6023 free_page((unsigned long)vmx_bitmap[i]); 6024 } 6025 } 6026 6027 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 6028 { 6029 int i; 6030 6031 if (!cpu_has_vmx_shadow_vmcs()) 6032 enable_shadow_vmcs = 0; 6033 if (enable_shadow_vmcs) { 6034 for (i = 0; i < VMX_BITMAP_NR; i++) { 6035 /* 6036 * The vmx_bitmap is not tied to a VM and so should 6037 * not be charged to a memcg. 6038 */ 6039 vmx_bitmap[i] = (unsigned long *) 6040 __get_free_page(GFP_KERNEL); 6041 if (!vmx_bitmap[i]) { 6042 nested_vmx_hardware_unsetup(); 6043 return -ENOMEM; 6044 } 6045 } 6046 6047 init_vmcs_shadow_fields(); 6048 } 6049 6050 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear, 6051 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch, 6052 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld, 6053 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst, 6054 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread, 6055 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume, 6056 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite, 6057 exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff, 6058 exit_handlers[EXIT_REASON_VMON] = handle_vmon, 6059 exit_handlers[EXIT_REASON_INVEPT] = handle_invept, 6060 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid, 6061 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc, 6062 6063 kvm_x86_ops->check_nested_events = vmx_check_nested_events; 6064 kvm_x86_ops->get_nested_state = vmx_get_nested_state; 6065 kvm_x86_ops->set_nested_state = vmx_set_nested_state; 6066 kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages, 6067 kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; 6068 kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version; 6069 6070 return 0; 6071 } 6072