// SPDX-License-Identifier: GPL-2.0

#include <linux/frame.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "trace.h"
#include "x86.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC(consistency_check)						\
({									\
	bool failed = (consistency_check);				\
	if (failed)							\
		trace_kvm_nested_vmenter_failed(#consistency_check, 0);	\
	failed;								\
})

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
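		 * The switch below therefore drops GUEST_PML_INDEX,
		 * VMX_PREEMPTION_TIMER_VALUE and GUEST_INTR_STATUS from the
		 * shadow field list when the corresponding hardware support
		 * is absent.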
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
		return nested_vmx_failInvalid(vcpu);

	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force a shadow sync because
	 * VM_INSTRUCTION_ERROR is not shadowed
	 */
	return kvm_skip_emulated_instruction(vcpu);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't simply reset the guest here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, -1ull);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.hv_evmcs)
		return;

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
	vmx->nested.hv_evmcs_vmptr = -1ull;
	vmx->nested.hv_evmcs = NULL;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
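 *
 * This tears down the shadow VMCS, the cached vmcs12 and shadow vmcs12
 * copies, the pinned APIC-access page, the mapped virtual-APIC and
 * posted-interrupt descriptor pages, the guest_mmu roots, any enlightened
 * VMCS mapping, and finally the vmcs02 itself.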
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_dirty(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (vmx->loaded_vmcs == vmcs)
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vmx_segment_cache_clear(vmx);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
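 * free_nested() clears and frees the shadow VMCS and vmcs02, so neither
 * may still be loaded on the processor when it runs.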
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
	free_nested(vcpu);
	vcpu_put(vcpu);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else if (fault->error_code & PFERR_RSVD_MASK)
		exit_reason = EXIT_REASON_EPT_MISCONFIG;
	else
		exit_reason = EXIT_REASON_EPT_VIOLATION;

	nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	kvm_init_shadow_ept_mmu(vcpu,
				to_vmx(vcpu)->nested.msrs.ept_caps &
				VMX_EPT_EXECUTE_ONLY_BIT,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_cr3(vcpu));
	vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
	vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}


/*
 * KVM wants to re-inject page faults it received into the guest. This
 * function checks whether, for a nested guest, they need to be injected
 * into L1 or L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (nr == PF_VECTOR) {
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
			return 1;
		}
	} else if (vmcs12->exception_bitmap & (1u << nr)) {
		if (nr == DB_VECTOR) {
			if (!has_payload) {
				payload = vcpu->arch.dr6;
				payload &= ~(DR6_FIXED_1 | DR6_BT);
				payload ^= DR6_RTM;
			}
			*exit_qual = payload;
		} else
			*exit_qual = 0;
		return 1;
	}

	return 0;
}


static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
	    !to_vmx(vcpu)->nested.nested_run_pending) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}

static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * Check if MSR is intercepted for L01 MSR bitmap.
 */
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;

	if (msr <= 0x1fff) {
		return !!test_bit(msr, msr_bitmap + 0x800 / f);
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
	}

	return true;
}

/*
 * If an MSR is allowed by L0, we should check whether it is allowed by L1.
 * The corresponding bit will be cleared unless both L0 and L1 allow it.
 */
static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
						 unsigned long *msr_bitmap_nested,
						 u32 msr, int type)
{
	int f = sizeof(unsigned long);

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way
	 * round. We can control MSRs 0x00000000-0x00001fff and
	 * 0xc0000000-0xc0001fff.
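	 * The offsets used below follow that layout: read bits for the low
	 * MSR range at 0x000, read bits for the high range at 0x400, write
	 * bits for the low range at 0x800 and write bits for the high range
	 * at 0xc00.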
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R &&
		    !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
			/* read-low */
			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

		if (type & MSR_TYPE_W &&
		    !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
			/* write-low */
			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R &&
		    !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
			/* read-high */
			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);

		if (type & MSR_TYPE_W &&
		    !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
			/* write-high */
			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

	}
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively disable them below.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);

	/*
	 * Checking the L0->L1 bitmap is trying to verify two things:
	 *
	 * 1. L0 gave permission to L1 to actually pass through the MSR. This
	 *    ensures that we do not accidentally generate an L02 MSR bitmap
	 *    from the L12 MSR bitmap that is too permissive.
	 * 2. That L1 or L2s have actually used the MSR. This avoids
	 *    unnecessary merging of the bitmap if the MSR is unused. This
	 *    works properly because we only update the L01 MSR bitmap lazily.
	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
	 *    updated to reflect this when L1 (or its L2s) actually write to
	 *    the MSR.
	 */
	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_SPEC_CTRL,
					MSR_TYPE_R | MSR_TYPE_W);

	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_PRED_CMD,
					MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct kvm_host_map map;
	struct vmcs12 *shadow;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	shadow = get_shadow_vmcs12(vcpu);

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
		return;

	memcpy(shadow, map.hva, VMCS12_SIZE);
	kvm_vcpu_unmap(vcpu, &map, false);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
	return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has already been checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    (CC(!nested_cpu_has_vid(vmcs12)) ||
	     CC(!nested_exit_intr_ack_set(vcpu)) ||
	     CC((vmcs12->posted_intr_nv & 0xff00)) ||
	     CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
	     CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	int maxphyaddr;

	if (count == 0)
		return 0;
	maxphyaddr = cpuid_maxphyaddr(vcpu);
	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load the guest's/host's MSRs at nested entry/exit.
 * Return 0 for success, the (1-based) entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware as
 * much as possible, process all valid entries before failing rather than
 * prechecking for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	return i + 1;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (kvm_vcpu_read_guest(vcpu,
					gpa + i * sizeof(e),
					&e, 2 * sizeof(u32))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			return -EINVAL;
		}
		if (nested_vmx_store_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			return -EINVAL;
		}
		if (kvm_get_msr(vcpu, e.index, &data)) {
			pr_debug_ratelimited(
				"%s cannot read MSR (%u, 0x%x)\n",
				__func__, i, e.index);
			return -EINVAL;
		}
		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
					     offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
	unsigned long invalid_mask;

	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
	return (val & invalid_mask) == 0;
}

/*
 * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
 * emulating VM entry into a guest with EPT enabled.
 * Returns 0 on success, -EINVAL on failure. The invalid-state exit
 * qualification code is assigned to entry_failure_code on failure.
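 *
 * When nested_ept is true the PDPTEs are not loaded from the new cr3 here,
 * since with EPT enabled CR3 is not consumed by the CPU (see the PAE
 * special case below).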
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
			       u32 *entry_failure_code)
{
	if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
		if (CC(!nested_cr3_valid(vcpu, cr3))) {
			*entry_failure_code = ENTRY_FAIL_DEFAULT;
			return -EINVAL;
		}

		/*
		 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
		 * must not be dereferenced.
		 */
		if (is_pae_paging(vcpu) && !nested_ept) {
			if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
				*entry_failure_code = ENTRY_FAIL_PDPTE;
				return -EINVAL;
			}
		}
	}

	if (!nested_ept)
		kvm_mmu_new_cr3(vcpu, cr3, false);

	vcpu->arch.cr3 = cr3;
	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);

	kvm_init_mmu(vcpu, false);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L1 uses EPT, then TLB entries are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return nested_cpu_has_ept(vmcs12) ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmx->nested.msrs.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
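	 * VMX_BASIC[48] advertises exactly that constraint, so reject any
	 * attempt by userspace to set it.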
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 supported;
	u32 *lowp, *highp;

	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		lowp = &vmx->nested.msrs.pinbased_ctls_low;
		highp = &vmx->nested.msrs.pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		lowp = &vmx->nested.msrs.procbased_ctls_low;
		highp = &vmx->nested.msrs.procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		lowp = &vmx->nested.msrs.exit_ctls_low;
		highp = &vmx->nested.msrs.exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		lowp = &vmx->nested.msrs.entry_ctls_low;
		highp = &vmx->nested.msrs.entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		lowp = &vmx->nested.msrs.secondary_ctls_low;
		highp = &vmx->nested.msrs.secondary_ctls_high;
		break;
	default:
		BUG();
	}

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc;

	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				   vmx->nested.msrs.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap;

	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
					   vmx->nested.msrs.vpid_caps);

	/* Every bit is either reserved or a feature bit.
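	 * The restored value must therefore be a bitwise subset of what KVM
	 * already reports: userspace may clear capabilities but never add
	 * new ones.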
	 */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 *msr;

	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		msr = &vmx->nested.msrs.cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		msr = &vmx->nested.msrs.cr4_fixed0;
		break;
	default:
		BUG();
	}

	/*
	 * 1 bits (which indicate bits that are "must-be-1" during VMX
	 * operation) must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*msr = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmx->nested.msrs.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise.
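 * A non-zero return means msr_index does not name one of the VMX
 * capability MSRs handled here.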
 */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
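 *
 * The copy runs with preemption disabled while the shadow VMCS is the
 * current VMCS, and reloads loaded_vmcs->vmcs before returning.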
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
	}

	/*
	 * Not used?
	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
	 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
	 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
	 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
	 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
	 * vmcs12->page_fault_error_code_mask =
	 *		evmcs->page_fault_error_code_mask;
	 * vmcs12->page_fault_error_code_match =
	 *		evmcs->page_fault_error_code_match;
	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
	 */

	/*
	 * Read only fields:
	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
	 * vmcs12->exit_qualification = evmcs->exit_qualification;
	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
	 *
	 * Not present in struct vmcs12:
	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return 0;
}

static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/*
	 * Should not be changed by KVM:
	 *
	 * evmcs->host_es_selector = vmcs12->host_es_selector;
	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
	 * evmcs->host_cr0 = vmcs12->host_cr0;
	 * evmcs->host_cr3 = vmcs12->host_cr3;
	 * evmcs->host_cr4 = vmcs12->host_cr4;
	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
	 * evmcs->host_rip = vmcs12->host_rip;
	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
	 * evmcs->host_fs_base = vmcs12->host_fs_base;
	 * evmcs->host_gs_base = vmcs12->host_gs_base;
	 * evmcs->host_tr_base = vmcs12->host_tr_base;
	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
	 * evmcs->host_rsp = vmcs12->host_rsp;
	 * sync_vmcs02_to_vmcs12() doesn't read these:
	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
	 * evmcs->ept_pointer = vmcs12->ept_pointer;
	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
	 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
	 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
	 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
	 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
	 * evmcs->page_fault_error_code_mask =
	 *		vmcs12->page_fault_error_code_mask;
	 * evmcs->page_fault_error_code_match =
	 *		vmcs12->page_fault_error_code_match;
	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
	 * evmcs->tsc_offset = vmcs12->tsc_offset;
	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
	 *
	 * Not present in struct vmcs12:
	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
	 */

	evmcs->guest_es_selector = vmcs12->guest_es_selector;
	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;

	evmcs->guest_es_limit = vmcs12->guest_es_limit;
	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;

	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
vmcs12->guest_cs_ar_bytes; 1742 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1743 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1744 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1745 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1746 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1747 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1748 1749 evmcs->guest_es_base = vmcs12->guest_es_base; 1750 evmcs->guest_cs_base = vmcs12->guest_cs_base; 1751 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1752 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1753 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1754 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1755 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1756 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1757 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1758 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1759 1760 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1761 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1762 1763 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1764 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1765 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1766 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1767 1768 evmcs->guest_pending_dbg_exceptions = 1769 vmcs12->guest_pending_dbg_exceptions; 1770 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1771 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1772 1773 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1774 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1775 1776 evmcs->guest_cr0 = vmcs12->guest_cr0; 1777 evmcs->guest_cr3 = vmcs12->guest_cr3; 1778 evmcs->guest_cr4 = vmcs12->guest_cr4; 1779 evmcs->guest_dr7 = vmcs12->guest_dr7; 1780 1781 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1782 1783 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1784 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1785 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1786 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1787 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1788 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1789 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1790 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1791 1792 evmcs->exit_qualification = vmcs12->exit_qualification; 1793 1794 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1795 evmcs->guest_rsp = vmcs12->guest_rsp; 1796 evmcs->guest_rflags = vmcs12->guest_rflags; 1797 1798 evmcs->guest_interruptibility_info = 1799 vmcs12->guest_interruptibility_info; 1800 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1801 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1802 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1803 evmcs->vm_entry_exception_error_code = 1804 vmcs12->vm_entry_exception_error_code; 1805 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 1806 1807 evmcs->guest_rip = vmcs12->guest_rip; 1808 1809 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 1810 1811 return 0; 1812 } 1813 1814 /* 1815 * This is an equivalent of the nested hypervisor executing the vmptrld 1816 * instruction. 
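 *
 * Unlike a real VMPTRLD, the eVMCS pointer does not come from an instruction
 * operand; it is provided by nested_enlightened_vmentry(), which reads it
 * from the Hyper-V VP assist page. The function returns 1 when there is
 * nothing to do (enlightened VMCS not enabled, or no eVMCS pointer set) and
 * on success, and 0 when the eVMCS cannot be mapped or advertises an
 * unsupported revision.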
 */
static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
						 bool from_launch)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool evmcs_gpa_changed = false;
	u64 evmcs_gpa;

	if (likely(!vmx->nested.enlightened_vmcs_enabled))
		return 1;

	if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
		return 1;

	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
		if (!vmx->nested.hv_evmcs)
			vmx->nested.current_vmptr = -1ull;

		nested_release_evmcs(vcpu);

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
				 &vmx->nested.hv_evmcs_map))
			return 0;

		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

		/*
		 * Currently, KVM only supports eVMCS version 1
		 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set
		 * this value in the first u32 field of the eVMCS, which
		 * should specify the eVMCS VersionNumber.
		 *
		 * The guest should learn the eVMCS versions supported by the
		 * host by examining CPUID.0x4000000A.EAX[0:15]. The host
		 * userspace VMM is expected to set this CPUID leaf according
		 * to the value returned in vmcs_version from
		 * nested_enable_evmcs().
		 *
		 * However, it turns out that Microsoft Hyper-V fails to
		 * comply with its own invented interface: when Hyper-V uses
		 * eVMCS, it just sets the first u32 field of the eVMCS to the
		 * revision_id specified in MSR_IA32_VMX_BASIC, instead of
		 * using the eVMCS version number, which is one of the
		 * supported versions specified in CPUID.0x4000000A.EAX[0:15].
		 *
		 * To overcome this Hyper-V bug, we accept here either a
		 * supported eVMCS version or the VMCS12 revision_id as valid
		 * values for the first u32 field of the eVMCS.
		 */
		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
			nested_release_evmcs(vcpu);
			return 0;
		}

		vmx->nested.dirty_vmcs12 = true;
		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

		evmcs_gpa_changed = true;
		/*
		 * Unlike normal vmcs12, enlightened vmcs12 is not fully
		 * reloaded from guest's memory (read only fields, fields not
		 * present in struct hv_enlightened_vmcs, ...). Make sure there
		 * are no leftovers.
		 */
		if (from_launch) {
			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
			memset(vmcs12, 0, sizeof(*vmcs12));
			vmcs12->hdr.revision_id = VMCS12_REVISION;
		}

	}

	/*
	 * Clean fields data can't be used on VMLAUNCH and when we switch
	 * between different L2 guests as KVM keeps a single VMCS12 per L1.
	 */
	if (from_launch || evmcs_gpa_changed)
		vmx->nested.hv_evmcs->hv_clean_fields &=
			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return 1;
}

void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * hv_evmcs may end up not being mapped after migration (when
	 * L2 was running); map it here to make sure vmcs12 changes are
	 * properly reflected.
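	 *
	 * Passing from_launch=false below means the cached vmcs12 is not
	 * zeroed; the eVMCS is simply (re)mapped so that
	 * copy_vmcs12_to_enlightened() below has a destination to write to.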
1908 */ 1909 if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) 1910 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 1911 1912 if (vmx->nested.hv_evmcs) { 1913 copy_vmcs12_to_enlightened(vmx); 1914 /* All fields are clean */ 1915 vmx->nested.hv_evmcs->hv_clean_fields |= 1916 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 1917 } else { 1918 copy_vmcs12_to_shadow(vmx); 1919 } 1920 1921 vmx->nested.need_vmcs12_to_shadow_sync = false; 1922 } 1923 1924 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 1925 { 1926 struct vcpu_vmx *vmx = 1927 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 1928 1929 vmx->nested.preemption_timer_expired = true; 1930 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 1931 kvm_vcpu_kick(&vmx->vcpu); 1932 1933 return HRTIMER_NORESTART; 1934 } 1935 1936 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) 1937 { 1938 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; 1939 struct vcpu_vmx *vmx = to_vmx(vcpu); 1940 1941 /* 1942 * A timer value of zero is architecturally guaranteed to cause 1943 * a VMExit prior to executing any instructions in the guest. 1944 */ 1945 if (preemption_timeout == 0) { 1946 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 1947 return; 1948 } 1949 1950 if (vcpu->arch.virtual_tsc_khz == 0) 1951 return; 1952 1953 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 1954 preemption_timeout *= 1000000; 1955 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 1956 hrtimer_start(&vmx->nested.preemption_timer, 1957 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); 1958 } 1959 1960 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 1961 { 1962 if (vmx->nested.nested_run_pending && 1963 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 1964 return vmcs12->guest_ia32_efer; 1965 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 1966 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 1967 else 1968 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 1969 } 1970 1971 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 1972 { 1973 /* 1974 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 1975 * according to L0's settings (vmcs12 is irrelevant here). Host 1976 * fields that come from L0 and are not constant, e.g. HOST_CR3, 1977 * will be set as needed prior to VMLAUNCH/VMRESUME. 1978 */ 1979 if (vmx->nested.vmcs02_initialized) 1980 return; 1981 vmx->nested.vmcs02_initialized = true; 1982 1983 /* 1984 * We don't care what the EPTP value is we just need to guarantee 1985 * it's valid so we don't get a false positive when doing early 1986 * consistency checks. 1987 */ 1988 if (enable_ept && nested_early_check) 1989 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); 1990 1991 /* All VMFUNCs are currently emulated through L0 vmexits. */ 1992 if (cpu_has_vmx_vmfunc()) 1993 vmcs_write64(VM_FUNCTION_CONTROL, 0); 1994 1995 if (cpu_has_vmx_posted_intr()) 1996 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 1997 1998 if (cpu_has_vmx_msr_bitmap()) 1999 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2000 2001 /* 2002 * The PML address never changes, so it is constant in vmcs02. 2003 * Conceptually we want to copy the PML index from vmcs01 here, 2004 * and then back to vmcs01 on nested vmexit. But since we flush 2005 * the log and reset GUEST_PML_INDEX on each vmexit, the PML 2006 * index is also effectively constant in vmcs02. 
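	 *
	 * Note that PML is not exposed to L1 here: prepare_vmcs02_early()
	 * strips SECONDARY_EXEC_ENABLE_PML from vmcs12's secondary controls,
	 * so vmcs02 always uses L0's own PML buffer and index.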
2007 */ 2008 if (enable_pml) { 2009 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 2010 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 2011 } 2012 2013 if (cpu_has_vmx_encls_vmexit()) 2014 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); 2015 2016 /* 2017 * Set the MSR load/store lists to match L0's settings. Only the 2018 * addresses are constant (for vmcs02), the counts can change based 2019 * on L2's behavior, e.g. switching to/from long mode. 2020 */ 2021 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 2022 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2023 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2024 2025 vmx_set_constant_host_state(vmx); 2026 } 2027 2028 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2029 struct vmcs12 *vmcs12) 2030 { 2031 prepare_vmcs02_constant_state(vmx); 2032 2033 vmcs_write64(VMCS_LINK_POINTER, -1ull); 2034 2035 if (enable_vpid) { 2036 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2037 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2038 else 2039 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2040 } 2041 } 2042 2043 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2044 { 2045 u32 exec_control, vmcs12_exec_ctrl; 2046 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2047 2048 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) 2049 prepare_vmcs02_early_rare(vmx, vmcs12); 2050 2051 /* 2052 * PIN CONTROLS 2053 */ 2054 exec_control = vmx_pin_based_exec_ctrl(vmx); 2055 exec_control |= (vmcs12->pin_based_vm_exec_control & 2056 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2057 2058 /* Posted interrupts setting is only taken from vmcs12. */ 2059 if (nested_cpu_has_posted_intr(vmcs12)) { 2060 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2061 vmx->nested.pi_pending = false; 2062 } else { 2063 exec_control &= ~PIN_BASED_POSTED_INTR; 2064 } 2065 pin_controls_set(vmx, exec_control); 2066 2067 /* 2068 * EXEC CONTROLS 2069 */ 2070 exec_control = vmx_exec_control(vmx); /* L0's desires */ 2071 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 2072 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 2073 exec_control &= ~CPU_BASED_TPR_SHADOW; 2074 exec_control |= vmcs12->cpu_based_vm_exec_control; 2075 2076 if (exec_control & CPU_BASED_TPR_SHADOW) 2077 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2078 #ifdef CONFIG_X86_64 2079 else 2080 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2081 CPU_BASED_CR8_STORE_EXITING; 2082 #endif 2083 2084 /* 2085 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2086 * for I/O port accesses. 2087 */ 2088 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2089 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2090 2091 /* 2092 * This bit will be computed in nested_get_vmcs12_pages, because 2093 * we do not have access to L1's MSR bitmap yet. For now, keep 2094 * the same bit as before, hoping to avoid multiple VMWRITEs that 2095 * only set/clear this bit. 
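	 *
	 * nested_get_vmcs12_pages() makes the final decision: it sets
	 * CPU_BASED_USE_MSR_BITMAPS only if nested_vmx_prepare_msr_bitmap()
	 * succeeds in building a merged bitmap, and clears the bit otherwise.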
2096 */ 2097 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2098 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2099 2100 exec_controls_set(vmx, exec_control); 2101 2102 /* 2103 * SECONDARY EXEC CONTROLS 2104 */ 2105 if (cpu_has_secondary_exec_ctrls()) { 2106 exec_control = vmx->secondary_exec_control; 2107 2108 /* Take the following fields only from vmcs12 */ 2109 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2110 SECONDARY_EXEC_ENABLE_INVPCID | 2111 SECONDARY_EXEC_RDTSCP | 2112 SECONDARY_EXEC_XSAVES | 2113 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2114 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2115 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2116 SECONDARY_EXEC_ENABLE_VMFUNC); 2117 if (nested_cpu_has(vmcs12, 2118 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { 2119 vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & 2120 ~SECONDARY_EXEC_ENABLE_PML; 2121 exec_control |= vmcs12_exec_ctrl; 2122 } 2123 2124 /* VMCS shadowing for L2 is emulated for now */ 2125 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2126 2127 /* 2128 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2129 * will not have to rewrite the controls just for this bit. 2130 */ 2131 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2132 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2133 exec_control |= SECONDARY_EXEC_DESC; 2134 2135 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2136 vmcs_write16(GUEST_INTR_STATUS, 2137 vmcs12->guest_intr_status); 2138 2139 secondary_exec_controls_set(vmx, exec_control); 2140 } 2141 2142 /* 2143 * ENTRY CONTROLS 2144 * 2145 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2146 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2147 * on the related bits (if supported by the CPU) in the hope that 2148 * we can avoid VMWrites during vmx_set_efer(). 2149 */ 2150 exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & 2151 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; 2152 if (cpu_has_load_ia32_efer()) { 2153 if (guest_efer & EFER_LMA) 2154 exec_control |= VM_ENTRY_IA32E_MODE; 2155 if (guest_efer != host_efer) 2156 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2157 } 2158 vm_entry_controls_set(vmx, exec_control); 2159 2160 /* 2161 * EXIT CONTROLS 2162 * 2163 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2164 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2165 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
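	 *
	 * As with the entry controls above, VM_EXIT_LOAD_IA32_EFER is set
	 * speculatively when the guest and host EFER differ, in the hope of
	 * saving a VMWRITE in vmx_set_efer().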
2166 */ 2167 exec_control = vmx_vmexit_ctrl(); 2168 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2169 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2170 vm_exit_controls_set(vmx, exec_control); 2171 2172 /* 2173 * Interrupt/Exception Fields 2174 */ 2175 if (vmx->nested.nested_run_pending) { 2176 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2177 vmcs12->vm_entry_intr_info_field); 2178 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2179 vmcs12->vm_entry_exception_error_code); 2180 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2181 vmcs12->vm_entry_instruction_len); 2182 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2183 vmcs12->guest_interruptibility_info); 2184 vmx->loaded_vmcs->nmi_known_unmasked = 2185 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2186 } else { 2187 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2188 } 2189 } 2190 2191 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2192 { 2193 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2194 2195 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2196 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2197 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2198 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2199 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2200 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2201 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2202 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2203 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2204 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2205 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2206 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2207 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2208 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2209 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2210 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2211 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2212 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2213 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2214 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2215 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2216 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2217 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2218 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2219 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2220 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2221 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2222 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2223 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2224 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2225 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2226 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2227 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2228 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2229 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2230 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2231 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2232 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2233 } 2234 2235 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2236 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2237 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2238 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 
2239 vmcs12->guest_pending_dbg_exceptions); 2240 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2241 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2242 2243 /* 2244 * L1 may access the L2's PDPTR, so save them to construct 2245 * vmcs12 2246 */ 2247 if (enable_ept) { 2248 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2249 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2250 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2251 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2252 } 2253 2254 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2255 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2256 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2257 } 2258 2259 if (nested_cpu_has_xsaves(vmcs12)) 2260 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2261 2262 /* 2263 * Whether page-faults are trapped is determined by a combination of 2264 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. 2265 * If enable_ept, L0 doesn't care about page faults and we should 2266 * set all of these to L1's desires. However, if !enable_ept, L0 does 2267 * care about (at least some) page faults, and because it is not easy 2268 * (if at all possible?) to merge L0 and L1's desires, we simply ask 2269 * to exit on each and every L2 page fault. This is done by setting 2270 * MASK=MATCH=0 and (see below) EB.PF=1. 2271 * Note that below we don't need special code to set EB.PF beyond the 2272 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2273 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2274 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2275 */ 2276 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 2277 enable_ept ? vmcs12->page_fault_error_code_mask : 0); 2278 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 2279 enable_ept ? vmcs12->page_fault_error_code_match : 0); 2280 2281 if (cpu_has_vmx_apicv()) { 2282 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2283 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2284 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2285 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2286 } 2287 2288 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2289 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2290 2291 set_cr4_guest_host_mask(vmx); 2292 } 2293 2294 /* 2295 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2296 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2297 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2298 * guest in a way that will both be appropriate to L1's requests, and our 2299 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2300 * function also has additional necessary side-effects, like setting various 2301 * vcpu->arch fields. 2302 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2303 * is assigned to entry_failure_code on failure. 
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  u32 *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
	bool load_guest_pdptrs_vmcs12 = false;

	if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
		prepare_vmcs02_rare(vmx, vmcs12);
		vmx->nested.dirty_vmcs12 = false;

		load_guest_pdptrs_vmcs12 = !hv_evmcs ||
			!(hv_evmcs->hv_clean_fields &
			  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
	}

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
	}
	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);

	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	if (vmx->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
	}

	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);

	if (kvm_has_tsc_control)
		decache_tsc_multiplier(vmx);

	if (enable_vpid) {
		/*
		 * There is no direct mapping between vpid02 and vpid12; vpid02
		 * is per-vCPU for L0 and reused, while the value of vpid12 is
		 * changed with one INVVPID during nested vmentry. vpid12 is
		 * allocated by L1 for L2, so it will not influence the global
		 * bitmap (used for vpid01 and vpid02 allocation) even if L1
		 * spawns a lot of nested vCPUs.
		 */
		if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
			if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
				vmx->nested.last_vpid = vmcs12->virtual_processor_id;
				__vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
			}
		} else {
			/*
			 * If L1 uses EPT, then L0 needs to execute INVEPT on
			 * EPTP02 instead of EPTP01. Therefore, delay TLB
			 * flush until vmcs02->eptp is fully updated by
			 * KVM_REQ_LOAD_CR3. Note that this assumes
			 * KVM_REQ_TLB_FLUSH is evaluated after
			 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
			 */
			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
		}
	}

	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);
	else if (nested_cpu_has2(vmcs12,
				 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		vmx_flush_tlb(vcpu, true);

	/*
	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
	 * bits which we consider mandatory enabled.
2391 * The CR0_READ_SHADOW is what L2 should have expected to read given 2392 * the specifications by L1; It's not enough to take 2393 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we 2394 * have more bits than L1 expected. 2395 */ 2396 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2397 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2398 2399 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2400 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2401 2402 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2403 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2404 vmx_set_efer(vcpu, vcpu->arch.efer); 2405 2406 /* 2407 * Guest state is invalid and unrestricted guest is disabled, 2408 * which means L1 attempted VMEntry to L2 with invalid state. 2409 * Fail the VMEntry. 2410 */ 2411 if (vmx->emulation_required) { 2412 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2413 return -EINVAL; 2414 } 2415 2416 /* Shadow page tables on either EPT or shadow page tables. */ 2417 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2418 entry_failure_code)) 2419 return -EINVAL; 2420 2421 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2422 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2423 is_pae_paging(vcpu)) { 2424 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2425 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2426 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2427 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2428 } 2429 2430 if (!enable_ept) 2431 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2432 2433 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2434 kvm_rip_write(vcpu, vmcs12->guest_rip); 2435 return 0; 2436 } 2437 2438 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2439 { 2440 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2441 nested_cpu_has_virtual_nmis(vmcs12))) 2442 return -EINVAL; 2443 2444 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2445 nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))) 2446 return -EINVAL; 2447 2448 return 0; 2449 } 2450 2451 static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) 2452 { 2453 struct vcpu_vmx *vmx = to_vmx(vcpu); 2454 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2455 2456 /* Check for memory type validity */ 2457 switch (address & VMX_EPTP_MT_MASK) { 2458 case VMX_EPTP_MT_UC: 2459 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2460 return false; 2461 break; 2462 case VMX_EPTP_MT_WB: 2463 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2464 return false; 2465 break; 2466 default: 2467 return false; 2468 } 2469 2470 /* only 4 levels page-walk length are valid */ 2471 if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)) 2472 return false; 2473 2474 /* Reserved bits should not be set */ 2475 if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f))) 2476 return false; 2477 2478 /* AD, if set, should be supported */ 2479 if (address & VMX_EPTP_AD_ENABLE_BIT) { 2480 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2481 return false; 2482 } 2483 2484 return true; 2485 } 2486 2487 /* 2488 * Checks related to VM-Execution Control Fields 2489 */ 2490 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2491 struct vmcs12 *vmcs12) 2492 { 2493 struct vcpu_vmx *vmx = to_vmx(vcpu); 2494 2495 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2496 vmx->nested.msrs.pinbased_ctls_low, 2497 vmx->nested.msrs.pinbased_ctls_high)) || 2498 
CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2499 vmx->nested.msrs.procbased_ctls_low, 2500 vmx->nested.msrs.procbased_ctls_high))) 2501 return -EINVAL; 2502 2503 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2504 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2505 vmx->nested.msrs.secondary_ctls_low, 2506 vmx->nested.msrs.secondary_ctls_high))) 2507 return -EINVAL; 2508 2509 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2510 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2511 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2512 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2513 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2514 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2515 nested_vmx_check_nmi_controls(vmcs12) || 2516 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2517 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2518 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2519 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2520 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2521 return -EINVAL; 2522 2523 if (!nested_cpu_has_preemption_timer(vmcs12) && 2524 nested_cpu_has_save_preemption_timer(vmcs12)) 2525 return -EINVAL; 2526 2527 if (nested_cpu_has_ept(vmcs12) && 2528 CC(!valid_ept_address(vcpu, vmcs12->ept_pointer))) 2529 return -EINVAL; 2530 2531 if (nested_cpu_has_vmfunc(vmcs12)) { 2532 if (CC(vmcs12->vm_function_control & 2533 ~vmx->nested.msrs.vmfunc_controls)) 2534 return -EINVAL; 2535 2536 if (nested_cpu_has_eptp_switching(vmcs12)) { 2537 if (CC(!nested_cpu_has_ept(vmcs12)) || 2538 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2539 return -EINVAL; 2540 } 2541 } 2542 2543 return 0; 2544 } 2545 2546 /* 2547 * Checks related to VM-Exit Control Fields 2548 */ 2549 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2550 struct vmcs12 *vmcs12) 2551 { 2552 struct vcpu_vmx *vmx = to_vmx(vcpu); 2553 2554 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2555 vmx->nested.msrs.exit_ctls_low, 2556 vmx->nested.msrs.exit_ctls_high)) || 2557 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2558 return -EINVAL; 2559 2560 return 0; 2561 } 2562 2563 /* 2564 * Checks related to VM-Entry Control Fields 2565 */ 2566 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2567 struct vmcs12 *vmcs12) 2568 { 2569 struct vcpu_vmx *vmx = to_vmx(vcpu); 2570 2571 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2572 vmx->nested.msrs.entry_ctls_low, 2573 vmx->nested.msrs.entry_ctls_high))) 2574 return -EINVAL; 2575 2576 /* 2577 * From the Intel SDM, volume 3: 2578 * Fields relevant to VM-entry event injection must be set properly. 2579 * These fields are the VM-entry interruption-information field, the 2580 * VM-entry exception error code, and the VM-entry instruction length. 
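	 *
	 * The checks below apply only when the "valid" bit
	 * (INTR_INFO_VALID_MASK) of the VM-entry interruption-information
	 * field is set; an event is injected on VM-entry only in that case.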
2581 */ 2582 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2583 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2584 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2585 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2586 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2587 bool should_have_error_code; 2588 bool urg = nested_cpu_has2(vmcs12, 2589 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2590 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2591 2592 /* VM-entry interruption-info field: interruption type */ 2593 if (CC(intr_type == INTR_TYPE_RESERVED) || 2594 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2595 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2596 return -EINVAL; 2597 2598 /* VM-entry interruption-info field: vector */ 2599 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2600 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2601 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2602 return -EINVAL; 2603 2604 /* VM-entry interruption-info field: deliver error code */ 2605 should_have_error_code = 2606 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2607 x86_exception_has_error_code(vector); 2608 if (CC(has_error_code != should_have_error_code)) 2609 return -EINVAL; 2610 2611 /* VM-entry exception error code */ 2612 if (CC(has_error_code && 2613 vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))) 2614 return -EINVAL; 2615 2616 /* VM-entry interruption-info field: reserved bits */ 2617 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2618 return -EINVAL; 2619 2620 /* VM-entry instruction length */ 2621 switch (intr_type) { 2622 case INTR_TYPE_SOFT_EXCEPTION: 2623 case INTR_TYPE_SOFT_INTR: 2624 case INTR_TYPE_PRIV_SW_EXCEPTION: 2625 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2626 CC(vmcs12->vm_entry_instruction_len == 0 && 2627 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2628 return -EINVAL; 2629 } 2630 } 2631 2632 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2633 return -EINVAL; 2634 2635 return 0; 2636 } 2637 2638 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2639 struct vmcs12 *vmcs12) 2640 { 2641 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2642 nested_check_vm_exit_controls(vcpu, vmcs12) || 2643 nested_check_vm_entry_controls(vcpu, vmcs12)) 2644 return -EINVAL; 2645 2646 return 0; 2647 } 2648 2649 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2650 struct vmcs12 *vmcs12) 2651 { 2652 bool ia32e; 2653 2654 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2655 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2656 CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3))) 2657 return -EINVAL; 2658 2659 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2660 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2661 return -EINVAL; 2662 2663 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2664 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2665 return -EINVAL; 2666 2667 #ifdef CONFIG_X86_64 2668 ia32e = !!(vcpu->arch.efer & EFER_LMA); 2669 #else 2670 ia32e = false; 2671 #endif 2672 2673 if (ia32e) { 2674 if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) || 2675 CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2676 return -EINVAL; 2677 } else { 2678 if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) || 2679 CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2680 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2681 CC((vmcs12->host_rip) >> 32)) 2682 return -EINVAL; 2683 } 2684 
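	/*
	 * Host segment selector checks: the RPL and TI bits must be zero in
	 * every host selector field, CS and TR cannot be the NULL selector,
	 * and SS may be NULL only when the host is in 64-bit mode.
	 */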
2685 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2686 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2687 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2688 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2689 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2690 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2691 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2692 CC(vmcs12->host_cs_selector == 0) || 2693 CC(vmcs12->host_tr_selector == 0) || 2694 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2695 return -EINVAL; 2696 2697 #ifdef CONFIG_X86_64 2698 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2699 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2700 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2701 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2702 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2703 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2704 return -EINVAL; 2705 #endif 2706 2707 /* 2708 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2709 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2710 * the values of the LMA and LME bits in the field must each be that of 2711 * the host address-space size VM-exit control. 2712 */ 2713 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2714 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2715 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2716 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2717 return -EINVAL; 2718 } 2719 2720 return 0; 2721 } 2722 2723 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2724 struct vmcs12 *vmcs12) 2725 { 2726 int r = 0; 2727 struct vmcs12 *shadow; 2728 struct kvm_host_map map; 2729 2730 if (vmcs12->vmcs_link_pointer == -1ull) 2731 return 0; 2732 2733 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2734 return -EINVAL; 2735 2736 if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))) 2737 return -EINVAL; 2738 2739 shadow = map.hva; 2740 2741 if (CC(shadow->hdr.revision_id != VMCS12_REVISION) || 2742 CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 2743 r = -EINVAL; 2744 2745 kvm_vcpu_unmap(vcpu, &map, false); 2746 return r; 2747 } 2748 2749 /* 2750 * Checks related to Guest Non-register State 2751 */ 2752 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2753 { 2754 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2755 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)) 2756 return -EINVAL; 2757 2758 return 0; 2759 } 2760 2761 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 2762 struct vmcs12 *vmcs12, 2763 u32 *exit_qual) 2764 { 2765 bool ia32e; 2766 2767 *exit_qual = ENTRY_FAIL_DEFAULT; 2768 2769 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 2770 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 2771 return -EINVAL; 2772 2773 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 2774 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 2775 return -EINVAL; 2776 2777 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 2778 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; 2779 return -EINVAL; 2780 } 2781 2782 /* 2783 * If the load IA32_EFER VM-entry control is 1, the following checks 2784 * are performed on the field for the IA32_EFER MSR: 2785 * - Bits reserved in the IA32_EFER MSR 
must be 0.
	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
	 *   the IA-32e mode guest VM-exit control. It must also be identical
	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
	 *   CR0.PG) is 1.
	 */
	if (to_vmx(vcpu)->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
		if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
		    CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
		    CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
			ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
			return -EINVAL;
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
	    (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
	     CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
		return -EINVAL;

	if (nested_check_guest_non_reg_state(vmcs12))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long cr3, cr4;
	bool vm_fail;

	if (!nested_early_check)
		return 0;

	if (vmx->msr_autoload.host.nr)
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	if (vmx->msr_autoload.guest.nr)
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);

	preempt_disable();

	vmx_prepare_switch_to_guest(vcpu);

	/*
	 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
	 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
	 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
	 * there is no need to preserve other bits or save/restore the field.
	 */
	vmcs_writel(GUEST_RFLAGS, 0);

	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->host_state.cr3 = cr3;
	}

	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->host_state.cr4 = cr4;
	}

	asm(
		"sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
		"cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
		"je 1f \n\t"
		__ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
		"mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
		"1: \n\t"
		"add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */

		/* Check if vmlaunch or vmresume is needed */
		"cmpb $0, %c[launched](%[loaded_vmcs])\n\t"

		/*
		 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
		 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
		 * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
		 * results of VM-Enter are captured via CC_{SET,OUT} to vm_fail.
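		 * The "be" (below-or-equal) condition is CF=1 || ZF=1, so a
		 * single flag check covers both the VM-Fail Invalid and the
		 * VM-Fail Valid cases.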
2867 */ 2868 "call vmx_vmenter\n\t" 2869 2870 CC_SET(be) 2871 : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) 2872 : [HOST_RSP]"r"((unsigned long)HOST_RSP), 2873 [loaded_vmcs]"r"(vmx->loaded_vmcs), 2874 [launched]"i"(offsetof(struct loaded_vmcs, launched)), 2875 [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), 2876 [wordsize]"i"(sizeof(ulong)) 2877 : "memory" 2878 ); 2879 2880 if (vmx->msr_autoload.host.nr) 2881 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2882 if (vmx->msr_autoload.guest.nr) 2883 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2884 2885 if (vm_fail) { 2886 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 2887 2888 preempt_enable(); 2889 2890 trace_kvm_nested_vmenter_failed( 2891 "early hardware check VM-instruction error: ", error); 2892 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 2893 return 1; 2894 } 2895 2896 /* 2897 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 2898 */ 2899 local_irq_enable(); 2900 if (hw_breakpoint_active()) 2901 set_debugreg(__this_cpu_read(cpu_dr7), 7); 2902 preempt_enable(); 2903 2904 /* 2905 * A non-failing VMEntry means we somehow entered guest mode with 2906 * an illegal RIP, and that's just the tip of the iceberg. There 2907 * is no telling what memory has been modified or what state has 2908 * been exposed to unknown code. Hitting this all but guarantees 2909 * a (very critical) hardware issue. 2910 */ 2911 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 2912 VMX_EXIT_REASONS_FAILED_VMENTRY)); 2913 2914 return 0; 2915 } 2916 2917 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 2918 struct vmcs12 *vmcs12); 2919 2920 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 2921 { 2922 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2923 struct vcpu_vmx *vmx = to_vmx(vcpu); 2924 struct kvm_host_map *map; 2925 struct page *page; 2926 u64 hpa; 2927 2928 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 2929 /* 2930 * Translate L1 physical address to host physical 2931 * address for vmcs02. Keep the page pinned, so this 2932 * physical address remains valid. We keep a reference 2933 * to it so we can release it later. 2934 */ 2935 if (vmx->nested.apic_access_page) { /* shouldn't happen */ 2936 kvm_release_page_dirty(vmx->nested.apic_access_page); 2937 vmx->nested.apic_access_page = NULL; 2938 } 2939 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); 2940 /* 2941 * If translation failed, no matter: This feature asks 2942 * to exit when accessing the given address, and if it 2943 * can never be accessed, this feature won't do 2944 * anything anyway. 2945 */ 2946 if (!is_error_page(page)) { 2947 vmx->nested.apic_access_page = page; 2948 hpa = page_to_phys(vmx->nested.apic_access_page); 2949 vmcs_write64(APIC_ACCESS_ADDR, hpa); 2950 } else { 2951 secondary_exec_controls_clearbit(vmx, 2952 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 2953 } 2954 } 2955 2956 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 2957 map = &vmx->nested.virtual_apic_map; 2958 2959 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 2960 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 2961 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 2962 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 2963 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 2964 /* 2965 * The processor will never use the TPR shadow, simply 2966 * clear the bit from the execution control. 
Such a
			 * configuration is useless, but it happens in tests.
			 * For any other configuration, failing the vm entry is
			 * _not_ what the processor does but it's basically the
			 * only possibility we have.
			 */
			exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
		} else {
			/*
			 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
			 * force VM-Entry to fail.
			 */
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
		}
	}

	if (nested_cpu_has_posted_intr(vmcs12)) {
		map = &vmx->nested.pi_desc_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
			vmx->nested.pi_desc =
				(struct pi_desc *)(((void *)map->hva) +
				offset_in_page(vmcs12->posted_intr_desc_addr));
			vmcs_write64(POSTED_INTR_DESC_ADDR,
				     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
		}
	}
	if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
		exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
	else
		exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
}

/*
 * Intel's VMX Instruction Reference specifies a common set of prerequisites
 * for running VMX instructions (except VMXON, whose prerequisites are
 * slightly different). It also specifies what exception to inject otherwise.
 * Note that many of these exceptions have priority over VM exits, so they
 * don't have to be checked again here.
 */
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{
	if (!to_vmx(vcpu)->nested.vmxon) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 0;
	}

	if (vmx_get_cpl(vcpu)) {
		kvm_inject_gp(vcpu, 0);
		return 0;
	}

	return 1;
}

static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
{
	u8 rvi = vmx_get_rvi();
	u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);

	return ((rvi & 0xf0) > (vppr & 0xf0));
}

static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12);

/*
 * If from_vmentry is false, this is being called from state restore (either RSM
 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
 *
 * Returns:
 *	0 - success, i.e. proceed with actual VMEnter
 *	1 - consistency check VMExit
 *	-1 - consistency check VMFail
 */
int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	bool evaluate_pending_interrupts;
	u32 exit_reason = EXIT_REASON_INVALID_STATE;
	u32 exit_qual;

	evaluate_pending_interrupts = exec_controls_get(vmx) &
		(CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
	if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
		evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);

	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
	if (kvm_mpx_supported() &&
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);

	/*
	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
	 * nested early checks are disabled. In the event of a "late" VM-Fail,
	 * i.e.
a VM-Fail detected by hardware but not KVM, KVM must unwind its 3064 * software model to the pre-VMEntry host state. When EPT is disabled, 3065 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3066 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3067 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3068 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3069 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3070 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3071 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3072 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3073 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3074 * path would need to manually save/restore vmcs01.GUEST_CR3. 3075 */ 3076 if (!enable_ept && !nested_early_check) 3077 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3078 3079 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3080 3081 prepare_vmcs02_early(vmx, vmcs12); 3082 3083 if (from_vmentry) { 3084 nested_get_vmcs12_pages(vcpu); 3085 3086 if (nested_vmx_check_vmentry_hw(vcpu)) { 3087 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3088 return -1; 3089 } 3090 3091 if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) 3092 goto vmentry_fail_vmexit; 3093 } 3094 3095 enter_guest_mode(vcpu); 3096 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 3097 vcpu->arch.tsc_offset += vmcs12->tsc_offset; 3098 3099 if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) 3100 goto vmentry_fail_vmexit_guest_mode; 3101 3102 if (from_vmentry) { 3103 exit_reason = EXIT_REASON_MSR_LOAD_FAIL; 3104 exit_qual = nested_vmx_load_msr(vcpu, 3105 vmcs12->vm_entry_msr_load_addr, 3106 vmcs12->vm_entry_msr_load_count); 3107 if (exit_qual) 3108 goto vmentry_fail_vmexit_guest_mode; 3109 } else { 3110 /* 3111 * The MMU is not initialized to point at the right entities yet and 3112 * "get pages" would need to read data from the guest (i.e. we will 3113 * need to perform gpa to hpa translation). Request a call 3114 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3115 * have already been set at vmentry time and should not be reset. 3116 */ 3117 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); 3118 } 3119 3120 /* 3121 * If L1 had a pending IRQ/NMI until it executed 3122 * VMLAUNCH/VMRESUME which wasn't delivered because it was 3123 * disallowed (e.g. interrupts disabled), L0 needs to 3124 * evaluate if this pending event should cause an exit from L2 3125 * to L1 or delivered directly to L2 (e.g. In case L1 don't 3126 * intercept EXTERNAL_INTERRUPT). 3127 * 3128 * Usually this would be handled by the processor noticing an 3129 * IRQ/NMI window request, or checking RVI during evaluation of 3130 * pending virtual interrupts. However, this setting was done 3131 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 3132 * to perform pending event evaluation by requesting a KVM_REQ_EVENT. 3133 */ 3134 if (unlikely(evaluate_pending_interrupts)) 3135 kvm_make_request(KVM_REQ_EVENT, vcpu); 3136 3137 /* 3138 * Do not start the preemption timer hrtimer until after we know 3139 * we are successful, so that only nested_vmx_vmexit needs to cancel 3140 * the timer. 3141 */ 3142 vmx->nested.preemption_timer_expired = false; 3143 if (nested_cpu_has_preemption_timer(vmcs12)) 3144 vmx_start_preemption_timer(vcpu); 3145 3146 /* 3147 * Note no nested_vmx_succeed or nested_vmx_fail here. 
At this point 3148 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3149 * returned as far as L1 is concerned. It will only return (and set 3150 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3151 */ 3152 return 0; 3153 3154 /* 3155 * A failed consistency check that leads to a VMExit during L1's 3156 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3157 * 26.7 "VM-entry failures during or after loading guest state". 3158 */ 3159 vmentry_fail_vmexit_guest_mode: 3160 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 3161 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3162 leave_guest_mode(vcpu); 3163 3164 vmentry_fail_vmexit: 3165 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3166 3167 if (!from_vmentry) 3168 return 1; 3169 3170 load_vmcs12_host_state(vcpu, vmcs12); 3171 vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 3172 vmcs12->exit_qualification = exit_qual; 3173 if (enable_shadow_vmcs || vmx->nested.hv_evmcs) 3174 vmx->nested.need_vmcs12_to_shadow_sync = true; 3175 return 1; 3176 } 3177 3178 /* 3179 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3180 * for running an L2 nested guest. 3181 */ 3182 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3183 { 3184 struct vmcs12 *vmcs12; 3185 struct vcpu_vmx *vmx = to_vmx(vcpu); 3186 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3187 int ret; 3188 3189 if (!nested_vmx_check_permission(vcpu)) 3190 return 1; 3191 3192 if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch)) 3193 return 1; 3194 3195 if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) 3196 return nested_vmx_failInvalid(vcpu); 3197 3198 vmcs12 = get_vmcs12(vcpu); 3199 3200 /* 3201 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3202 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3203 * rather than RFLAGS.ZF, and no error number is stored to the 3204 * VM-instruction error field. 3205 */ 3206 if (vmcs12->hdr.shadow_vmcs) 3207 return nested_vmx_failInvalid(vcpu); 3208 3209 if (vmx->nested.hv_evmcs) { 3210 copy_enlightened_to_vmcs12(vmx); 3211 /* Enlightened VMCS doesn't have launch state */ 3212 vmcs12->launch_state = !launch; 3213 } else if (enable_shadow_vmcs) { 3214 copy_shadow_to_vmcs12(vmx); 3215 } 3216 3217 /* 3218 * The nested entry process starts with enforcing various prerequisites 3219 * on vmcs12 as required by the Intel SDM, and act appropriately when 3220 * they fail: As the SDM explains, some conditions should cause the 3221 * instruction to fail, while others will cause the instruction to seem 3222 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3223 * To speed up the normal (success) code path, we should avoid checking 3224 * for misconfigurations which will anyway be caught by the processor 3225 * when using the merged vmcs02. 3226 */ 3227 if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) 3228 return nested_vmx_failValid(vcpu, 3229 VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3230 3231 if (vmcs12->launch_state == launch) 3232 return nested_vmx_failValid(vcpu, 3233 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS
				       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);

	if (nested_vmx_check_controls(vcpu, vmcs12))
		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	if (nested_vmx_check_host_state(vcpu, vmcs12))
		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);

	/*
	 * We're finally done with prerequisite checking, and can start with
	 * the nested entry.
	 */
	vmx->nested.nested_run_pending = 1;
	ret = nested_vmx_enter_non_root_mode(vcpu, true);
	vmx->nested.nested_run_pending = !ret;
	if (ret > 0)
		return 1;
	else if (ret)
		return nested_vmx_failValid(vcpu,
			VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	/* Hide L1D cache contents from the nested guest. */
	vmx->vcpu.arch.l1tf_flush_l1d = true;

	/*
	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
	 * also be used as part of restoring nVMX state for
	 * snapshot restore (migration).
	 *
	 * In this flow, it is assumed that the vmcs12 cache was
	 * transferred as part of the captured nVMX state and should
	 * therefore not be read from guest memory (which may not
	 * exist on the destination host yet).
	 */
	nested_cache_shadow_vmcs12(vcpu, vmcs12);

	/*
	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
	 * awakened by event injection or by an NMI-window VM-exit or
	 * by an interrupt-window VM-exit, halt the vcpu.
	 */
	if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
	    !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
	    !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
	    !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
	      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
		vmx->nested.nested_run_pending = 0;
		return kvm_vcpu_halt(vcpu);
	}
	return 1;
}

/*
 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
 * This function returns the new value we should put in vmcs12.guest_cr0.
 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
 *    available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
 *    didn't trap the bit, because if L1 did, so would L0).
 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
 *    been modified by L2, and L1 knows it. So just leave the old value of
 *    the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
 *    isn't relevant, because if L0 traps this bit it can set it to anything.
 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
 *    changed these bits, and therefore they need to be updated, but L0
 *    didn't necessarily allow them to be changed in GUEST_CR0 - and rather
 *    put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
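 *
 * The same three-way merge is applied to CR4 by vmcs12_guest_cr4() below,
 * using cr4_guest_host_mask and CR4_READ_SHADOW.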
3302 */ 3303 static inline unsigned long 3304 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3305 { 3306 return 3307 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3308 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3309 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3310 vcpu->arch.cr0_guest_owned_bits)); 3311 } 3312 3313 static inline unsigned long 3314 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3315 { 3316 return 3317 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3318 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3319 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3320 vcpu->arch.cr4_guest_owned_bits)); 3321 } 3322 3323 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3324 struct vmcs12 *vmcs12) 3325 { 3326 u32 idt_vectoring; 3327 unsigned int nr; 3328 3329 if (vcpu->arch.exception.injected) { 3330 nr = vcpu->arch.exception.nr; 3331 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3332 3333 if (kvm_exception_is_soft(nr)) { 3334 vmcs12->vm_exit_instruction_len = 3335 vcpu->arch.event_exit_inst_len; 3336 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3337 } else 3338 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3339 3340 if (vcpu->arch.exception.has_error_code) { 3341 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3342 vmcs12->idt_vectoring_error_code = 3343 vcpu->arch.exception.error_code; 3344 } 3345 3346 vmcs12->idt_vectoring_info_field = idt_vectoring; 3347 } else if (vcpu->arch.nmi_injected) { 3348 vmcs12->idt_vectoring_info_field = 3349 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3350 } else if (vcpu->arch.interrupt.injected) { 3351 nr = vcpu->arch.interrupt.nr; 3352 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3353 3354 if (vcpu->arch.interrupt.soft) { 3355 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3356 vmcs12->vm_entry_instruction_len = 3357 vcpu->arch.event_exit_inst_len; 3358 } else 3359 idt_vectoring |= INTR_TYPE_EXT_INTR; 3360 3361 vmcs12->idt_vectoring_info_field = idt_vectoring; 3362 } 3363 } 3364 3365 3366 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3367 { 3368 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3369 gfn_t gfn; 3370 3371 /* 3372 * Don't need to mark the APIC access page dirty; it is never 3373 * written to by the CPU during APIC virtualization. 
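 * The virtual-APIC page and the posted-interrupt descriptor, in contrast,
 * can be written by the CPU while L2 runs (e.g. TPR updates and
 * posted-interrupt processing), so mark them dirty below.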
3374 */ 3375 3376 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3377 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3378 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3379 } 3380 3381 if (nested_cpu_has_posted_intr(vmcs12)) { 3382 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3383 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3384 } 3385 } 3386 3387 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3388 { 3389 struct vcpu_vmx *vmx = to_vmx(vcpu); 3390 int max_irr; 3391 void *vapic_page; 3392 u16 status; 3393 3394 if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) 3395 return; 3396 3397 vmx->nested.pi_pending = false; 3398 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3399 return; 3400 3401 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3402 if (max_irr != 256) { 3403 vapic_page = vmx->nested.virtual_apic_map.hva; 3404 if (!vapic_page) 3405 return; 3406 3407 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3408 vapic_page, &max_irr); 3409 status = vmcs_read16(GUEST_INTR_STATUS); 3410 if ((u8)max_irr > ((u8)status & 0xff)) { 3411 status &= ~0xff; 3412 status |= (u8)max_irr; 3413 vmcs_write16(GUEST_INTR_STATUS, status); 3414 } 3415 } 3416 3417 nested_mark_vmcs12_pages_dirty(vcpu); 3418 } 3419 3420 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3421 unsigned long exit_qual) 3422 { 3423 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3424 unsigned int nr = vcpu->arch.exception.nr; 3425 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3426 3427 if (vcpu->arch.exception.has_error_code) { 3428 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3429 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3430 } 3431 3432 if (kvm_exception_is_soft(nr)) 3433 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3434 else 3435 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3436 3437 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3438 vmx_get_nmi_mask(vcpu)) 3439 intr_info |= INTR_INFO_UNBLOCK_NMI; 3440 3441 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3442 } 3443 3444 static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) 3445 { 3446 struct vcpu_vmx *vmx = to_vmx(vcpu); 3447 unsigned long exit_qual; 3448 bool block_nested_events = 3449 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3450 struct kvm_lapic *apic = vcpu->arch.apic; 3451 3452 if (lapic_in_kernel(vcpu) && 3453 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 3454 if (block_nested_events) 3455 return -EBUSY; 3456 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 3457 return 0; 3458 } 3459 3460 if (vcpu->arch.exception.pending && 3461 nested_vmx_check_exception(vcpu, &exit_qual)) { 3462 if (block_nested_events) 3463 return -EBUSY; 3464 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3465 return 0; 3466 } 3467 3468 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3469 vmx->nested.preemption_timer_expired) { 3470 if (block_nested_events) 3471 return -EBUSY; 3472 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3473 return 0; 3474 } 3475 3476 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { 3477 if (block_nested_events) 3478 return -EBUSY; 3479 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3480 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3481 INTR_INFO_VALID_MASK, 0); 3482 /* 3483 * The NMI-triggered VM exit counts as injection: 3484 * clear this one and block further NMIs. 
3485 */ 3486 vcpu->arch.nmi_pending = 0; 3487 vmx_set_nmi_mask(vcpu, true); 3488 return 0; 3489 } 3490 3491 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && 3492 nested_exit_on_intr(vcpu)) { 3493 if (block_nested_events) 3494 return -EBUSY; 3495 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3496 return 0; 3497 } 3498 3499 vmx_complete_nested_posted_interrupt(vcpu); 3500 return 0; 3501 } 3502 3503 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3504 { 3505 ktime_t remaining = 3506 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 3507 u64 value; 3508 3509 if (ktime_to_ns(remaining) <= 0) 3510 return 0; 3511 3512 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 3513 do_div(value, 1000000); 3514 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 3515 } 3516 3517 static bool is_vmcs12_ext_field(unsigned long field) 3518 { 3519 switch (field) { 3520 case GUEST_ES_SELECTOR: 3521 case GUEST_CS_SELECTOR: 3522 case GUEST_SS_SELECTOR: 3523 case GUEST_DS_SELECTOR: 3524 case GUEST_FS_SELECTOR: 3525 case GUEST_GS_SELECTOR: 3526 case GUEST_LDTR_SELECTOR: 3527 case GUEST_TR_SELECTOR: 3528 case GUEST_ES_LIMIT: 3529 case GUEST_CS_LIMIT: 3530 case GUEST_SS_LIMIT: 3531 case GUEST_DS_LIMIT: 3532 case GUEST_FS_LIMIT: 3533 case GUEST_GS_LIMIT: 3534 case GUEST_LDTR_LIMIT: 3535 case GUEST_TR_LIMIT: 3536 case GUEST_GDTR_LIMIT: 3537 case GUEST_IDTR_LIMIT: 3538 case GUEST_ES_AR_BYTES: 3539 case GUEST_DS_AR_BYTES: 3540 case GUEST_FS_AR_BYTES: 3541 case GUEST_GS_AR_BYTES: 3542 case GUEST_LDTR_AR_BYTES: 3543 case GUEST_TR_AR_BYTES: 3544 case GUEST_ES_BASE: 3545 case GUEST_CS_BASE: 3546 case GUEST_SS_BASE: 3547 case GUEST_DS_BASE: 3548 case GUEST_FS_BASE: 3549 case GUEST_GS_BASE: 3550 case GUEST_LDTR_BASE: 3551 case GUEST_TR_BASE: 3552 case GUEST_GDTR_BASE: 3553 case GUEST_IDTR_BASE: 3554 case GUEST_PENDING_DBG_EXCEPTIONS: 3555 case GUEST_BNDCFGS: 3556 return true; 3557 default: 3558 break; 3559 } 3560 3561 return false; 3562 } 3563 3564 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3565 struct vmcs12 *vmcs12) 3566 { 3567 struct vcpu_vmx *vmx = to_vmx(vcpu); 3568 3569 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 3570 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 3571 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 3572 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 3573 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 3574 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 3575 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 3576 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 3577 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 3578 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 3579 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 3580 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 3581 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 3582 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 3583 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 3584 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 3585 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 3586 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 3587 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 3588 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 3589 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 3590 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 3591 
vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 3592 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 3593 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 3594 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 3595 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 3596 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 3597 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 3598 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 3599 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 3600 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 3601 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 3602 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 3603 vmcs12->guest_pending_dbg_exceptions = 3604 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 3605 if (kvm_mpx_supported()) 3606 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3607 3608 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 3609 } 3610 3611 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3612 struct vmcs12 *vmcs12) 3613 { 3614 struct vcpu_vmx *vmx = to_vmx(vcpu); 3615 int cpu; 3616 3617 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 3618 return; 3619 3620 3621 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 3622 3623 cpu = get_cpu(); 3624 vmx->loaded_vmcs = &vmx->nested.vmcs02; 3625 vmx_vcpu_load(&vmx->vcpu, cpu); 3626 3627 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 3628 3629 vmx->loaded_vmcs = &vmx->vmcs01; 3630 vmx_vcpu_load(&vmx->vcpu, cpu); 3631 put_cpu(); 3632 } 3633 3634 /* 3635 * Update the guest state fields of vmcs12 to reflect changes that 3636 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 3637 * VM-entry controls is also updated, since this is really a guest 3638 * state bit.) 3639 */ 3640 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3641 { 3642 struct vcpu_vmx *vmx = to_vmx(vcpu); 3643 3644 if (vmx->nested.hv_evmcs) 3645 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 3646 3647 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs; 3648 3649 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 3650 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 3651 3652 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 3653 vmcs12->guest_rip = kvm_rip_read(vcpu); 3654 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 3655 3656 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 3657 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 3658 3659 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); 3660 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); 3661 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); 3662 3663 vmcs12->guest_interruptibility_info = 3664 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 3665 3666 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3667 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 3668 else 3669 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 3670 3671 if (nested_cpu_has_preemption_timer(vmcs12) && 3672 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) 3673 vmcs12->vmx_preemption_timer_value = 3674 vmx_get_preemption_timer_value(vcpu); 3675 3676 /* 3677 * In some cases (usually, nested EPT), L2 is allowed to change its 3678 * own CR3 without exiting. If it has changed it, we must keep it. 3679 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 3680 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 3681 * 3682 * Additionally, restore L2's PDPTR to vmcs12. 
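 * (The PDPTRs are only meaningful when L1 runs L2 with EPT and L2 uses
 * PAE paging, which is exactly what the checks below test.)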
3683 */ 3684 if (enable_ept) { 3685 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 3686 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3687 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 3688 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 3689 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 3690 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 3691 } 3692 } 3693 3694 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 3695 3696 if (nested_cpu_has_vid(vmcs12)) 3697 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 3698 3699 vmcs12->vm_entry_controls = 3700 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 3701 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 3702 3703 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 3704 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 3705 3706 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 3707 vmcs12->guest_ia32_efer = vcpu->arch.efer; 3708 } 3709 3710 /* 3711 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 3712 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 3713 * and this function updates it to reflect the changes to the guest state while 3714 * L2 was running (and perhaps made some exits which were handled directly by L0 3715 * without going back to L1), and to reflect the exit reason. 3716 * Note that we do not have to copy here all VMCS fields, just those that 3717 * could have been changed by the L2 guest or the exit - i.e., the guest-state and 3718 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 3719 * which already writes to vmcs12 directly. 3720 */ 3721 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 3722 u32 exit_reason, u32 exit_intr_info, 3723 unsigned long exit_qualification) 3724 { 3725 /* update exit information fields: */ 3726 vmcs12->vm_exit_reason = exit_reason; 3727 vmcs12->exit_qualification = exit_qualification; 3728 vmcs12->vm_exit_intr_info = exit_intr_info; 3729 3730 vmcs12->idt_vectoring_info_field = 0; 3731 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3732 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 3733 3734 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 3735 vmcs12->launch_state = 1; 3736 3737 /* vm_entry_intr_info_field is cleared on exit. Emulate this 3738 * instead of reading the real value. */ 3739 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 3740 3741 /* 3742 * Transfer the event that L0 or L1 may have wanted to inject into 3743 * L2 to IDT_VECTORING_INFO_FIELD. 3744 */ 3745 vmcs12_save_pending_event(vcpu, vmcs12); 3746 3747 /* 3748 * According to spec, there's no need to store the guest's 3749 * MSRs if the exit is due to a VM-entry failure that occurs 3750 * during or after loading the guest state. Since this exit 3751 * does not fall in that category, we need to save the MSRs. 3752 */ 3753 if (nested_vmx_store_msr(vcpu, 3754 vmcs12->vm_exit_msr_store_addr, 3755 vmcs12->vm_exit_msr_store_count)) 3756 nested_vmx_abort(vcpu, 3757 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 3758 } 3759 3760 /* 3761 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 3762 * preserved above and would only end up incorrectly in L1.
3763 */ 3764 vcpu->arch.nmi_injected = false; 3765 kvm_clear_exception_queue(vcpu); 3766 kvm_clear_interrupt_queue(vcpu); 3767 } 3768 3769 /* 3770 * A part of what we need to when the nested L2 guest exits and we want to 3771 * run its L1 parent, is to reset L1's guest state to the host state specified 3772 * in vmcs12. 3773 * This function is to be called not only on normal nested exit, but also on 3774 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 3775 * Failures During or After Loading Guest State"). 3776 * This function should be called when the active VMCS is L1's (vmcs01). 3777 */ 3778 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3779 struct vmcs12 *vmcs12) 3780 { 3781 struct kvm_segment seg; 3782 u32 entry_failure_code; 3783 3784 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 3785 vcpu->arch.efer = vmcs12->host_ia32_efer; 3786 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 3787 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 3788 else 3789 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 3790 vmx_set_efer(vcpu, vcpu->arch.efer); 3791 3792 kvm_rsp_write(vcpu, vmcs12->host_rsp); 3793 kvm_rip_write(vcpu, vmcs12->host_rip); 3794 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 3795 vmx_set_interrupt_shadow(vcpu, 0); 3796 3797 /* 3798 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 3799 * actually changed, because vmx_set_cr0 refers to efer set above. 3800 * 3801 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 3802 * (KVM doesn't change it); 3803 */ 3804 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 3805 vmx_set_cr0(vcpu, vmcs12->host_cr0); 3806 3807 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 3808 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 3809 vmx_set_cr4(vcpu, vmcs12->host_cr4); 3810 3811 nested_ept_uninit_mmu_context(vcpu); 3812 3813 /* 3814 * Only PDPTE load can fail as the value of cr3 was checked on entry and 3815 * couldn't have changed. 3816 */ 3817 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) 3818 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 3819 3820 if (!enable_ept) 3821 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 3822 3823 /* 3824 * If vmcs01 doesn't use VPID, CPU flushes TLB on every 3825 * VMEntry/VMExit. Thus, no need to flush TLB. 3826 * 3827 * If vmcs12 doesn't use VPID, L1 expects TLB to be 3828 * flushed on every VMEntry/VMExit. 3829 * 3830 * Otherwise, we can preserve TLB entries as long as we are 3831 * able to tag L1 TLB entries differently than L2 TLB entries. 3832 * 3833 * If vmcs12 uses EPT, we need to execute this flush on EPTP01 3834 * and therefore we request the TLB flush to happen only after VMCS EPTP 3835 * has been set by KVM_REQ_LOAD_CR3. 3836 */ 3837 if (enable_vpid && 3838 (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { 3839 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 3840 } 3841 3842 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 3843 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 3844 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 3845 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 3846 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 3847 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 3848 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 3849 3850 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. 
*/ 3851 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 3852 vmcs_write64(GUEST_BNDCFGS, 0); 3853 3854 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 3855 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 3856 vcpu->arch.pat = vmcs12->host_ia32_pat; 3857 } 3858 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 3859 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 3860 vmcs12->host_ia32_perf_global_ctrl); 3861 3862 /* Set L1 segment info according to Intel SDM 3863 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 3864 seg = (struct kvm_segment) { 3865 .base = 0, 3866 .limit = 0xFFFFFFFF, 3867 .selector = vmcs12->host_cs_selector, 3868 .type = 11, 3869 .present = 1, 3870 .s = 1, 3871 .g = 1 3872 }; 3873 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 3874 seg.l = 1; 3875 else 3876 seg.db = 1; 3877 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 3878 seg = (struct kvm_segment) { 3879 .base = 0, 3880 .limit = 0xFFFFFFFF, 3881 .type = 3, 3882 .present = 1, 3883 .s = 1, 3884 .db = 1, 3885 .g = 1 3886 }; 3887 seg.selector = vmcs12->host_ds_selector; 3888 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 3889 seg.selector = vmcs12->host_es_selector; 3890 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 3891 seg.selector = vmcs12->host_ss_selector; 3892 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 3893 seg.selector = vmcs12->host_fs_selector; 3894 seg.base = vmcs12->host_fs_base; 3895 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 3896 seg.selector = vmcs12->host_gs_selector; 3897 seg.base = vmcs12->host_gs_base; 3898 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 3899 seg = (struct kvm_segment) { 3900 .base = vmcs12->host_tr_base, 3901 .limit = 0x67, 3902 .selector = vmcs12->host_tr_selector, 3903 .type = 11, 3904 .present = 1 3905 }; 3906 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 3907 3908 kvm_set_dr(vcpu, 7, 0x400); 3909 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 3910 3911 if (cpu_has_vmx_msr_bitmap()) 3912 vmx_update_msr_bitmap(vcpu); 3913 3914 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 3915 vmcs12->vm_exit_msr_load_count)) 3916 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 3917 } 3918 3919 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 3920 { 3921 struct shared_msr_entry *efer_msr; 3922 unsigned int i; 3923 3924 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 3925 return vmcs_read64(GUEST_IA32_EFER); 3926 3927 if (cpu_has_load_ia32_efer()) 3928 return host_efer; 3929 3930 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 3931 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 3932 return vmx->msr_autoload.guest.val[i].value; 3933 } 3934 3935 efer_msr = find_msr_entry(vmx, MSR_EFER); 3936 if (efer_msr) 3937 return efer_msr->data; 3938 3939 return host_efer; 3940 } 3941 3942 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 3943 { 3944 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3945 struct vcpu_vmx *vmx = to_vmx(vcpu); 3946 struct vmx_msr_entry g, h; 3947 gpa_t gpa; 3948 u32 i, j; 3949 3950 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 3951 3952 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 3953 /* 3954 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 3955 * as vmcs01.GUEST_DR7 contains a userspace defined value 3956 * and vcpu->arch.dr7 is not squirreled away before the 3957 * nested VMENTER (not worth adding a variable in nested_vmx). 
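 * In that case fall back to DR7_FIXED_1 (the architectural reset value);
 * otherwise vmcs01's GUEST_DR7 still holds L1's value and is restored below.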
3958 */ 3959 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 3960 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 3961 else 3962 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 3963 } 3964 3965 /* 3966 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 3967 * handle a variety of side effects to KVM's software model. 3968 */ 3969 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 3970 3971 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 3972 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 3973 3974 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 3975 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 3976 3977 nested_ept_uninit_mmu_context(vcpu); 3978 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3979 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 3980 3981 /* 3982 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 3983 * from vmcs01 (if necessary). The PDPTRs are not loaded on 3984 * VMFail, like everything else we just need to ensure our 3985 * software model is up-to-date. 3986 */ 3987 if (enable_ept) 3988 ept_save_pdptrs(vcpu); 3989 3990 kvm_mmu_reset_context(vcpu); 3991 3992 if (cpu_has_vmx_msr_bitmap()) 3993 vmx_update_msr_bitmap(vcpu); 3994 3995 /* 3996 * This nasty bit of open coding is a compromise between blindly 3997 * loading L1's MSRs using the exit load lists (incorrect emulation 3998 * of VMFail), leaving the nested VM's MSRs in the software model 3999 * (incorrect behavior) and snapshotting the modified MSRs (too 4000 * expensive since the lists are unbound by hardware). For each 4001 * MSR that was (prematurely) loaded from the nested VMEntry load 4002 * list, reload it from the exit load list if it exists and differs 4003 * from the guest value. The intent is to stuff host state as 4004 * silently as possible, not to fully process the exit load list. 4005 */ 4006 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4007 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4008 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4009 pr_debug_ratelimited( 4010 "%s read MSR index failed (%u, 0x%08llx)\n", 4011 __func__, i, gpa); 4012 goto vmabort; 4013 } 4014 4015 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4016 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4017 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4018 pr_debug_ratelimited( 4019 "%s read MSR failed (%u, 0x%08llx)\n", 4020 __func__, j, gpa); 4021 goto vmabort; 4022 } 4023 if (h.index != g.index) 4024 continue; 4025 if (h.value == g.value) 4026 break; 4027 4028 if (nested_vmx_load_msr_check(vcpu, &h)) { 4029 pr_debug_ratelimited( 4030 "%s check failed (%u, 0x%x, 0x%x)\n", 4031 __func__, j, h.index, h.reserved); 4032 goto vmabort; 4033 } 4034 4035 if (kvm_set_msr(vcpu, h.index, h.value)) { 4036 pr_debug_ratelimited( 4037 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4038 __func__, j, h.index, h.value); 4039 goto vmabort; 4040 } 4041 } 4042 } 4043 4044 return; 4045 4046 vmabort: 4047 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4048 } 4049 4050 /* 4051 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4052 * and modify vmcs12 to make it see what it would expect to see there if 4053 * L2 was its real guest. 
Must only be called when in L2 (is_guest_mode()) 4054 */ 4055 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, 4056 u32 exit_intr_info, unsigned long exit_qualification) 4057 { 4058 struct vcpu_vmx *vmx = to_vmx(vcpu); 4059 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4060 4061 /* trying to cancel vmlaunch/vmresume is a bug */ 4062 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4063 4064 leave_guest_mode(vcpu); 4065 4066 if (nested_cpu_has_preemption_timer(vmcs12)) 4067 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4068 4069 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 4070 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 4071 4072 if (likely(!vmx->fail)) { 4073 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4074 4075 if (exit_reason != -1) 4076 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 4077 exit_qualification); 4078 4079 /* 4080 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4081 * also be used to capture vmcs12 cache as part of 4082 * capturing nVMX state for snapshot (migration). 4083 * 4084 * Otherwise, this flush will dirty guest memory at a 4085 * point it is already assumed by user-space to be 4086 * immutable. 4087 */ 4088 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4089 } else { 4090 /* 4091 * The only expected VM-instruction error is "VM entry with 4092 * invalid control field(s)." Anything else indicates a 4093 * problem with L0. And we should never get here with a 4094 * VMFail of any type if early consistency checks are enabled. 4095 */ 4096 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4097 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4098 WARN_ON_ONCE(nested_early_check); 4099 } 4100 4101 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4102 4103 /* Update any VMCS fields that might have changed while L2 ran */ 4104 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4105 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4106 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4107 4108 if (kvm_has_tsc_control) 4109 decache_tsc_multiplier(vmx); 4110 4111 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4112 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4113 vmx_set_virtual_apic_mode(vcpu); 4114 } else if (!nested_cpu_has_ept(vmcs12) && 4115 nested_cpu_has2(vmcs12, 4116 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 4117 vmx_flush_tlb(vcpu, true); 4118 } 4119 4120 /* Unpin physical memory we referred to in vmcs02 */ 4121 if (vmx->nested.apic_access_page) { 4122 kvm_release_page_dirty(vmx->nested.apic_access_page); 4123 vmx->nested.apic_access_page = NULL; 4124 } 4125 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4126 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4127 vmx->nested.pi_desc = NULL; 4128 4129 /* 4130 * We are now running in L2, mmu_notifier will force to reload the 4131 * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. 4132 */ 4133 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4134 4135 if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) 4136 vmx->nested.need_vmcs12_to_shadow_sync = true; 4137 4138 /* in case we halted in L2 */ 4139 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4140 4141 if (likely(!vmx->fail)) { 4142 /* 4143 * TODO: SDM says that with acknowledge interrupt on 4144 * exit, bit 31 of the VM-exit interrupt information 4145 * (valid interrupt) is always set to 1 on 4146 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't 4147 * need kvm_cpu_has_interrupt(). See the commit 4148 * message for details. 
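 * For now, when L1 has set 'acknowledge interrupt on exit', pull the
 * pending vector via kvm_cpu_get_interrupt() and store it in
 * vmcs12->vm_exit_intr_info as a valid external interrupt, mimicking what
 * hardware does when it acknowledges the interrupt on exit.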
4149 */ 4150 if (nested_exit_intr_ack_set(vcpu) && 4151 exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4152 kvm_cpu_has_interrupt(vcpu)) { 4153 int irq = kvm_cpu_get_interrupt(vcpu); 4154 WARN_ON(irq < 0); 4155 vmcs12->vm_exit_intr_info = irq | 4156 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4157 } 4158 4159 if (exit_reason != -1) 4160 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4161 vmcs12->exit_qualification, 4162 vmcs12->idt_vectoring_info_field, 4163 vmcs12->vm_exit_intr_info, 4164 vmcs12->vm_exit_intr_error_code, 4165 KVM_ISA_VMX); 4166 4167 load_vmcs12_host_state(vcpu, vmcs12); 4168 4169 return; 4170 } 4171 4172 /* 4173 * After an early L2 VM-entry failure, we're now back 4174 * in L1 which thinks it just finished a VMLAUNCH or 4175 * VMRESUME instruction, so we need to set the failure 4176 * flag and the VM-instruction error field of the VMCS 4177 * accordingly, and skip the emulated instruction. 4178 */ 4179 (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4180 4181 /* 4182 * Restore L1's host state to KVM's software model. We're here 4183 * because a consistency check was caught by hardware, which 4184 * means some amount of guest state has been propagated to KVM's 4185 * model and needs to be unwound to the host's state. 4186 */ 4187 nested_vmx_restore_host_state(vcpu); 4188 4189 vmx->fail = 0; 4190 } 4191 4192 /* 4193 * Decode the memory-address operand of a vmx instruction, as recorded on an 4194 * exit caused by such an instruction (run by a guest hypervisor). 4195 * On success, returns 0. When the operand is invalid, returns 1 and throws 4196 * #UD or #GP. 4197 */ 4198 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4199 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4200 { 4201 gva_t off; 4202 bool exn; 4203 struct kvm_segment s; 4204 4205 /* 4206 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4207 * Execution", on an exit, vmx_instruction_info holds most of the 4208 * addressing components of the operand. Only the displacement part 4209 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4210 * For how an actual address is calculated from all these components, 4211 * refer to Vol. 1, "Operand Addressing". 4212 */ 4213 int scaling = vmx_instruction_info & 3; 4214 int addr_size = (vmx_instruction_info >> 7) & 7; 4215 bool is_reg = vmx_instruction_info & (1u << 10); 4216 int seg_reg = (vmx_instruction_info >> 15) & 7; 4217 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4218 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4219 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4220 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4221 4222 if (is_reg) { 4223 kvm_queue_exception(vcpu, UD_VECTOR); 4224 return 1; 4225 } 4226 4227 /* Addr = segment_base + offset */ 4228 /* offset = base + [index * scale] + displacement */ 4229 off = exit_qualification; /* holds the displacement */ 4230 if (addr_size == 1) 4231 off = (gva_t)sign_extend64(off, 31); 4232 else if (addr_size == 0) 4233 off = (gva_t)sign_extend64(off, 15); 4234 if (base_is_valid) 4235 off += kvm_register_read(vcpu, base_reg); 4236 if (index_is_valid) 4237 off += kvm_register_read(vcpu, index_reg)<<scaling; 4238 vmx_get_segment(vcpu, &s, seg_reg); 4239 4240 /* 4241 * The effective address, i.e. @off, of a memory operand is truncated 4242 * based on the address size of the instruction. Note that this is 4243 * the *effective address*, i.e. 
the address prior to accounting for 4244 * the segment's base. 4245 */ 4246 if (addr_size == 1) /* 32 bit */ 4247 off &= 0xffffffff; 4248 else if (addr_size == 0) /* 16 bit */ 4249 off &= 0xffff; 4250 4251 /* Checks for #GP/#SS exceptions. */ 4252 exn = false; 4253 if (is_long_mode(vcpu)) { 4254 /* 4255 * The virtual/linear address is never truncated in 64-bit 4256 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4257 * address when using FS/GS with a non-zero base. 4258 */ 4259 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4260 *ret = s.base + off; 4261 else 4262 *ret = off; 4263 4264 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4265 * non-canonical form. This is the only check on the memory 4266 * destination for long mode! 4267 */ 4268 exn = is_noncanonical_address(*ret, vcpu); 4269 } else { 4270 /* 4271 * When not in long mode, the virtual/linear address is 4272 * unconditionally truncated to 32 bits regardless of the 4273 * address size. 4274 */ 4275 *ret = (s.base + off) & 0xffffffff; 4276 4277 /* Protected mode: apply checks for segment validity in the 4278 * following order: 4279 * - segment type check (#GP(0) may be thrown) 4280 * - usability check (#GP(0)/#SS(0)) 4281 * - limit check (#GP(0)/#SS(0)) 4282 */ 4283 if (wr) 4284 /* #GP(0) if the destination operand is located in a 4285 * read-only data segment or any code segment. 4286 */ 4287 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4288 else 4289 /* #GP(0) if the source operand is located in an 4290 * execute-only code segment 4291 */ 4292 exn = ((s.type & 0xa) == 8); 4293 if (exn) { 4294 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4295 return 1; 4296 } 4297 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4298 */ 4299 exn = (s.unusable != 0); 4300 4301 /* 4302 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4303 * outside the segment limit. All CPUs that support VMX ignore 4304 * limit checks for flat segments, i.e. segments with base==0, 4305 * limit==0xffffffff and of type expand-up data or code. 4306 */ 4307 if (!(s.base == 0 && s.limit == 0xffffffff && 4308 ((s.type & 8) || !(s.type & 4)))) 4309 exn = exn || ((u64)off + len - 1 > s.limit); 4310 } 4311 if (exn) { 4312 kvm_queue_exception_e(vcpu, 4313 seg_reg == VCPU_SREG_SS ? 4314 SS_VECTOR : GP_VECTOR, 4315 0); 4316 return 1; 4317 } 4318 4319 return 0; 4320 } 4321 4322 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) 4323 { 4324 gva_t gva; 4325 struct x86_exception e; 4326 4327 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 4328 vmcs_read32(VMX_INSTRUCTION_INFO), false, 4329 sizeof(*vmpointer), &gva)) 4330 return 1; 4331 4332 if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { 4333 kvm_inject_page_fault(vcpu, &e); 4334 return 1; 4335 } 4336 4337 return 0; 4338 } 4339 4340 /* 4341 * Allocate a shadow VMCS and associate it with the currently loaded 4342 * VMCS, unless such a shadow VMCS already exists. The newly allocated 4343 * VMCS is also VMCLEARed, so that it is ready for use. 4344 */ 4345 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 4346 { 4347 struct vcpu_vmx *vmx = to_vmx(vcpu); 4348 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 4349 4350 /* 4351 * We should allocate a shadow vmcs for vmcs01 only when L1 4352 * executes VMXON and free it when L1 executes VMXOFF. 4353 * As it is invalid to execute VMXON twice, we shouldn't reach 4354 * here when vmcs01 already have an allocated shadow vmcs. 
4355 */ 4356 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); 4357 4358 if (!loaded_vmcs->shadow_vmcs) { 4359 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 4360 if (loaded_vmcs->shadow_vmcs) 4361 vmcs_clear(loaded_vmcs->shadow_vmcs); 4362 } 4363 return loaded_vmcs->shadow_vmcs; 4364 } 4365 4366 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 4367 { 4368 struct vcpu_vmx *vmx = to_vmx(vcpu); 4369 int r; 4370 4371 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 4372 if (r < 0) 4373 goto out_vmcs02; 4374 4375 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4376 if (!vmx->nested.cached_vmcs12) 4377 goto out_cached_vmcs12; 4378 4379 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4380 if (!vmx->nested.cached_shadow_vmcs12) 4381 goto out_cached_shadow_vmcs12; 4382 4383 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 4384 goto out_shadow_vmcs; 4385 4386 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 4387 HRTIMER_MODE_REL_PINNED); 4388 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 4389 4390 vmx->nested.vpid02 = allocate_vpid(); 4391 4392 vmx->nested.vmcs02_initialized = false; 4393 vmx->nested.vmxon = true; 4394 4395 if (pt_mode == PT_MODE_HOST_GUEST) { 4396 vmx->pt_desc.guest.ctl = 0; 4397 pt_update_intercept_for_msr(vmx); 4398 } 4399 4400 return 0; 4401 4402 out_shadow_vmcs: 4403 kfree(vmx->nested.cached_shadow_vmcs12); 4404 4405 out_cached_shadow_vmcs12: 4406 kfree(vmx->nested.cached_vmcs12); 4407 4408 out_cached_vmcs12: 4409 free_loaded_vmcs(&vmx->nested.vmcs02); 4410 4411 out_vmcs02: 4412 return -ENOMEM; 4413 } 4414 4415 /* 4416 * Emulate the VMXON instruction. 4417 * Currently, we just remember that VMX is active, and do not save or even 4418 * inspect the argument to VMXON (the so-called "VMXON pointer") because we 4419 * do not currently need to store anything in that guest-allocated memory 4420 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their 4421 * argument is different from the VMXON pointer (which the spec says they do). 4422 */ 4423 static int handle_vmon(struct kvm_vcpu *vcpu) 4424 { 4425 int ret; 4426 gpa_t vmptr; 4427 uint32_t revision; 4428 struct vcpu_vmx *vmx = to_vmx(vcpu); 4429 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED 4430 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 4431 4432 /* 4433 * The Intel VMX Instruction Reference lists a bunch of bits that are 4434 * prerequisite to running VMXON, most notably cr4.VMXE must be set to 4435 * 1 (see vmx_set_cr4() for when we allow the guest to set this). 4436 * Otherwise, we should fail with #UD. But most faulting conditions 4437 * have already been checked by hardware, prior to the VM-exit for 4438 * VMXON. We do test guest cr4.VMXE because processor CR4 always has 4439 * that bit set to 1 in non-root mode. 4440 */ 4441 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { 4442 kvm_queue_exception(vcpu, UD_VECTOR); 4443 return 1; 4444 } 4445 4446 /* CPL=0 must be checked manually.
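 * Hardware does not perform this particular check before the VM-exit,
 * so emulate the #GP(0) here.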
*/ 4447 if (vmx_get_cpl(vcpu)) { 4448 kvm_inject_gp(vcpu, 0); 4449 return 1; 4450 } 4451 4452 if (vmx->nested.vmxon) 4453 return nested_vmx_failValid(vcpu, 4454 VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 4455 4456 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 4457 != VMXON_NEEDED_FEATURES) { 4458 kvm_inject_gp(vcpu, 0); 4459 return 1; 4460 } 4461 4462 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4463 return 1; 4464 4465 /* 4466 * SDM 3: 24.11.5 4467 * The first 4 bytes of VMXON region contain the supported 4468 * VMCS revision identifier 4469 * 4470 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case, 4471 * which replaces physical address width with 32 4472 */ 4473 if (!page_address_valid(vcpu, vmptr)) 4474 return nested_vmx_failInvalid(vcpu); 4475 4476 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 4477 revision != VMCS12_REVISION) 4478 return nested_vmx_failInvalid(vcpu); 4479 4480 vmx->nested.vmxon_ptr = vmptr; 4481 ret = enter_vmx_operation(vcpu); 4482 if (ret) 4483 return ret; 4484 4485 return nested_vmx_succeed(vcpu); 4486 } 4487 4488 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 4489 { 4490 struct vcpu_vmx *vmx = to_vmx(vcpu); 4491 4492 if (vmx->nested.current_vmptr == -1ull) 4493 return; 4494 4495 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 4496 4497 if (enable_shadow_vmcs) { 4498 /* copy to memory all shadowed fields in case 4499 they were modified */ 4500 copy_shadow_to_vmcs12(vmx); 4501 vmx_disable_shadow_vmcs(vmx); 4502 } 4503 vmx->nested.posted_intr_nv = -1; 4504 4505 /* Flush VMCS12 to guest memory */ 4506 kvm_vcpu_write_guest_page(vcpu, 4507 vmx->nested.current_vmptr >> PAGE_SHIFT, 4508 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 4509 4510 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 4511 4512 vmx->nested.current_vmptr = -1ull; 4513 } 4514 4515 /* Emulate the VMXOFF instruction */ 4516 static int handle_vmoff(struct kvm_vcpu *vcpu) 4517 { 4518 if (!nested_vmx_check_permission(vcpu)) 4519 return 1; 4520 4521 free_nested(vcpu); 4522 4523 /* Process a latched INIT during the time the CPU was in VMX operation */ 4524 kvm_make_request(KVM_REQ_EVENT, vcpu); 4525 4526 return nested_vmx_succeed(vcpu); 4527 } 4528 4529 /* Emulate the VMCLEAR instruction */ 4530 static int handle_vmclear(struct kvm_vcpu *vcpu) 4531 { 4532 struct vcpu_vmx *vmx = to_vmx(vcpu); 4533 u32 zero = 0; 4534 gpa_t vmptr; 4535 u64 evmcs_gpa; 4536 4537 if (!nested_vmx_check_permission(vcpu)) 4538 return 1; 4539 4540 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4541 return 1; 4542 4543 if (!page_address_valid(vcpu, vmptr)) 4544 return nested_vmx_failValid(vcpu, 4545 VMXERR_VMCLEAR_INVALID_ADDRESS); 4546 4547 if (vmptr == vmx->nested.vmxon_ptr) 4548 return nested_vmx_failValid(vcpu, 4549 VMXERR_VMCLEAR_VMXON_POINTER); 4550 4551 /* 4552 * When Enlightened VMEntry is enabled on the calling CPU we treat 4553 * memory area pointed to by vmptr as Enlightened VMCS (as there's no good 4554 * way to distinguish it from VMCS12) and we must not corrupt it by 4555 * writing to the non-existent 'launch_state' field. The area doesn't 4556 * have to be the currently active EVMCS on the calling CPU and there's 4557 * nothing KVM has to do to transition it from 'active' to 'non-active' 4558 * state. It is possible that the area will stay mapped as 4559 * vmx->nested.hv_evmcs but this shouldn't be a problem.
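 * Hence launch_state is only cleared in guest memory below when the area
 * is known to be a plain VMCS12; a VMCLEAR that may target an Enlightened
 * VMCS is treated as a successful nop.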
4560 */ 4561 if (likely(!vmx->nested.enlightened_vmcs_enabled || 4562 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 4563 if (vmptr == vmx->nested.current_vmptr) 4564 nested_release_vmcs12(vcpu); 4565 4566 kvm_vcpu_write_guest(vcpu, 4567 vmptr + offsetof(struct vmcs12, 4568 launch_state), 4569 &zero, sizeof(zero)); 4570 } 4571 4572 return nested_vmx_succeed(vcpu); 4573 } 4574 4575 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); 4576 4577 /* Emulate the VMLAUNCH instruction */ 4578 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 4579 { 4580 return nested_vmx_run(vcpu, true); 4581 } 4582 4583 /* Emulate the VMRESUME instruction */ 4584 static int handle_vmresume(struct kvm_vcpu *vcpu) 4585 { 4586 4587 return nested_vmx_run(vcpu, false); 4588 } 4589 4590 static int handle_vmread(struct kvm_vcpu *vcpu) 4591 { 4592 unsigned long field; 4593 u64 field_value; 4594 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4595 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4596 int len; 4597 gva_t gva = 0; 4598 struct vmcs12 *vmcs12; 4599 struct x86_exception e; 4600 short offset; 4601 4602 if (!nested_vmx_check_permission(vcpu)) 4603 return 1; 4604 4605 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) 4606 return nested_vmx_failInvalid(vcpu); 4607 4608 if (!is_guest_mode(vcpu)) 4609 vmcs12 = get_vmcs12(vcpu); 4610 else { 4611 /* 4612 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD 4613 * to shadowed-field sets the ALU flags for VMfailInvalid. 4614 */ 4615 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) 4616 return nested_vmx_failInvalid(vcpu); 4617 vmcs12 = get_shadow_vmcs12(vcpu); 4618 } 4619 4620 /* Decode instruction info and find the field to read */ 4621 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 4622 4623 offset = vmcs_field_to_offset(field); 4624 if (offset < 0) 4625 return nested_vmx_failValid(vcpu, 4626 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4627 4628 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 4629 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4630 4631 /* Read the field, zero-extended to a u64 field_value */ 4632 field_value = vmcs12_read_any(vmcs12, field, offset); 4633 4634 /* 4635 * Now copy part of this value to register or memory, as requested. 4636 * Note that the number of bits actually copied is 32 or 64 depending 4637 * on the guest's mode (32 or 64 bit), not on the given field's length. 4638 */ 4639 if (vmx_instruction_info & (1u << 10)) { 4640 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), 4641 field_value); 4642 } else { 4643 len = is_64_bit_mode(vcpu) ? 
8 : 4; 4644 if (get_vmx_mem_address(vcpu, exit_qualification, 4645 vmx_instruction_info, true, len, &gva)) 4646 return 1; 4647 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 4648 if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e)) 4649 kvm_inject_page_fault(vcpu, &e); 4650 } 4651 4652 return nested_vmx_succeed(vcpu); 4653 } 4654 4655 static bool is_shadow_field_rw(unsigned long field) 4656 { 4657 switch (field) { 4658 #define SHADOW_FIELD_RW(x, y) case x: 4659 #include "vmcs_shadow_fields.h" 4660 return true; 4661 default: 4662 break; 4663 } 4664 return false; 4665 } 4666 4667 static bool is_shadow_field_ro(unsigned long field) 4668 { 4669 switch (field) { 4670 #define SHADOW_FIELD_RO(x, y) case x: 4671 #include "vmcs_shadow_fields.h" 4672 return true; 4673 default: 4674 break; 4675 } 4676 return false; 4677 } 4678 4679 static int handle_vmwrite(struct kvm_vcpu *vcpu) 4680 { 4681 unsigned long field; 4682 int len; 4683 gva_t gva; 4684 struct vcpu_vmx *vmx = to_vmx(vcpu); 4685 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4686 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4687 4688 /* The value to write might be 32 or 64 bits, depending on L1's long 4689 * mode, and eventually we need to write that into a field of several 4690 * possible lengths. The code below first zero-extends the value to 64 4691 * bit (field_value), and then copies only the appropriate number of 4692 * bits into the vmcs12 field. 4693 */ 4694 u64 field_value = 0; 4695 struct x86_exception e; 4696 struct vmcs12 *vmcs12; 4697 short offset; 4698 4699 if (!nested_vmx_check_permission(vcpu)) 4700 return 1; 4701 4702 if (vmx->nested.current_vmptr == -1ull) 4703 return nested_vmx_failInvalid(vcpu); 4704 4705 if (vmx_instruction_info & (1u << 10)) 4706 field_value = kvm_register_readl(vcpu, 4707 (((vmx_instruction_info) >> 3) & 0xf)); 4708 else { 4709 len = is_64_bit_mode(vcpu) ? 8 : 4; 4710 if (get_vmx_mem_address(vcpu, exit_qualification, 4711 vmx_instruction_info, false, len, &gva)) 4712 return 1; 4713 if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) { 4714 kvm_inject_page_fault(vcpu, &e); 4715 return 1; 4716 } 4717 } 4718 4719 4720 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 4721 /* 4722 * If the vCPU supports "VMWRITE to any supported field in the 4723 * VMCS," then the "read-only" fields are actually read/write. 4724 */ 4725 if (vmcs_field_readonly(field) && 4726 !nested_cpu_has_vmwrite_any_field(vcpu)) 4727 return nested_vmx_failValid(vcpu, 4728 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 4729 4730 if (!is_guest_mode(vcpu)) { 4731 vmcs12 = get_vmcs12(vcpu); 4732 4733 /* 4734 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 4735 * vmcs12, else we may crush a field or consume a stale value. 4736 */ 4737 if (!is_shadow_field_rw(field)) 4738 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4739 } else { 4740 /* 4741 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE 4742 * to shadowed-field sets the ALU flags for VMfailInvalid. 4743 */ 4744 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) 4745 return nested_vmx_failInvalid(vcpu); 4746 vmcs12 = get_shadow_vmcs12(vcpu); 4747 } 4748 4749 offset = vmcs_field_to_offset(field); 4750 if (offset < 0) 4751 return nested_vmx_failValid(vcpu, 4752 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4753 4754 /* 4755 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 4756 * fields on VMWRITE. 
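 * (The 0x1f0ff mask applied below keeps the type, S, DPL and P bits plus
 * AVL, L, D/B, G and the 'unusable' bit, i.e. everything except the
 * reserved bits 11:8 and above 16.)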
Emulate this behavior to ensure consistent KVM 4757 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 4758 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 4759 * from L1 will return a different value than VMREAD from L2 (L1 sees 4760 * the stripped down value, L2 sees the full value as stored by KVM). 4761 */ 4762 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 4763 field_value &= 0x1f0ff; 4764 4765 vmcs12_write_any(vmcs12, field, offset, field_value); 4766 4767 /* 4768 * Do not track vmcs12 dirty-state if in guest-mode as we actually 4769 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 4770 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 4771 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 4772 */ 4773 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 4774 /* 4775 * L1 can read these fields without exiting, ensure the 4776 * shadow VMCS is up-to-date. 4777 */ 4778 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 4779 preempt_disable(); 4780 vmcs_load(vmx->vmcs01.shadow_vmcs); 4781 4782 __vmcs_writel(field, field_value); 4783 4784 vmcs_clear(vmx->vmcs01.shadow_vmcs); 4785 vmcs_load(vmx->loaded_vmcs->vmcs); 4786 preempt_enable(); 4787 } 4788 vmx->nested.dirty_vmcs12 = true; 4789 } 4790 4791 return nested_vmx_succeed(vcpu); 4792 } 4793 4794 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 4795 { 4796 vmx->nested.current_vmptr = vmptr; 4797 if (enable_shadow_vmcs) { 4798 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 4799 vmcs_write64(VMCS_LINK_POINTER, 4800 __pa(vmx->vmcs01.shadow_vmcs)); 4801 vmx->nested.need_vmcs12_to_shadow_sync = true; 4802 } 4803 vmx->nested.dirty_vmcs12 = true; 4804 } 4805 4806 /* Emulate the VMPTRLD instruction */ 4807 static int handle_vmptrld(struct kvm_vcpu *vcpu) 4808 { 4809 struct vcpu_vmx *vmx = to_vmx(vcpu); 4810 gpa_t vmptr; 4811 4812 if (!nested_vmx_check_permission(vcpu)) 4813 return 1; 4814 4815 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4816 return 1; 4817 4818 if (!page_address_valid(vcpu, vmptr)) 4819 return nested_vmx_failValid(vcpu, 4820 VMXERR_VMPTRLD_INVALID_ADDRESS); 4821 4822 if (vmptr == vmx->nested.vmxon_ptr) 4823 return nested_vmx_failValid(vcpu, 4824 VMXERR_VMPTRLD_VMXON_POINTER); 4825 4826 /* Forbid normal VMPTRLD if Enlightened version was used */ 4827 if (vmx->nested.hv_evmcs) 4828 return 1; 4829 4830 if (vmx->nested.current_vmptr != vmptr) { 4831 struct kvm_host_map map; 4832 struct vmcs12 *new_vmcs12; 4833 4834 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { 4835 /* 4836 * Reads from an unbacked page return all 1s, 4837 * which means that the 32 bits located at the 4838 * given physical address won't match the required 4839 * VMCS12_REVISION identifier. 4840 */ 4841 return nested_vmx_failValid(vcpu, 4842 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 4843 } 4844 4845 new_vmcs12 = map.hva; 4846 4847 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || 4848 (new_vmcs12->hdr.shadow_vmcs && 4849 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 4850 kvm_vcpu_unmap(vcpu, &map, false); 4851 return nested_vmx_failValid(vcpu, 4852 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 4853 } 4854 4855 nested_release_vmcs12(vcpu); 4856 4857 /* 4858 * Load VMCS12 from guest memory since it is not already 4859 * cached. 
4860 */ 4861 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); 4862 kvm_vcpu_unmap(vcpu, &map, false); 4863 4864 set_current_vmptr(vmx, vmptr); 4865 } 4866 4867 return nested_vmx_succeed(vcpu); 4868 } 4869 4870 /* Emulate the VMPTRST instruction */ 4871 static int handle_vmptrst(struct kvm_vcpu *vcpu) 4872 { 4873 unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); 4874 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4875 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 4876 struct x86_exception e; 4877 gva_t gva; 4878 4879 if (!nested_vmx_check_permission(vcpu)) 4880 return 1; 4881 4882 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) 4883 return 1; 4884 4885 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 4886 true, sizeof(gpa_t), &gva)) 4887 return 1; 4888 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 4889 if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, 4890 sizeof(gpa_t), &e)) { 4891 kvm_inject_page_fault(vcpu, &e); 4892 return 1; 4893 } 4894 return nested_vmx_succeed(vcpu); 4895 } 4896 4897 /* Emulate the INVEPT instruction */ 4898 static int handle_invept(struct kvm_vcpu *vcpu) 4899 { 4900 struct vcpu_vmx *vmx = to_vmx(vcpu); 4901 u32 vmx_instruction_info, types; 4902 unsigned long type; 4903 gva_t gva; 4904 struct x86_exception e; 4905 struct { 4906 u64 eptp, gpa; 4907 } operand; 4908 4909 if (!(vmx->nested.msrs.secondary_ctls_high & 4910 SECONDARY_EXEC_ENABLE_EPT) || 4911 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 4912 kvm_queue_exception(vcpu, UD_VECTOR); 4913 return 1; 4914 } 4915 4916 if (!nested_vmx_check_permission(vcpu)) 4917 return 1; 4918 4919 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4920 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 4921 4922 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 4923 4924 if (type >= 32 || !(types & (1 << type))) 4925 return nested_vmx_failValid(vcpu, 4926 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 4927 4928 /* According to the Intel VMX instruction reference, the memory 4929 * operand is read even if it isn't needed (e.g., for type==global) 4930 */ 4931 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 4932 vmx_instruction_info, false, sizeof(operand), &gva)) 4933 return 1; 4934 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { 4935 kvm_inject_page_fault(vcpu, &e); 4936 return 1; 4937 } 4938 4939 switch (type) { 4940 case VMX_EPT_EXTENT_GLOBAL: 4941 case VMX_EPT_EXTENT_CONTEXT: 4942 /* 4943 * TODO: Sync the necessary shadow EPT roots here, rather than 4944 * at the next emulated VM-entry.
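 * Until then, the flush is implicitly deferred to the next emulated
 * VM-entry.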
4945 */ 4946 break; 4947 default: 4948 BUG_ON(1); 4949 break; 4950 } 4951 4952 return nested_vmx_succeed(vcpu); 4953 } 4954 4955 static int handle_invvpid(struct kvm_vcpu *vcpu) 4956 { 4957 struct vcpu_vmx *vmx = to_vmx(vcpu); 4958 u32 vmx_instruction_info; 4959 unsigned long type, types; 4960 gva_t gva; 4961 struct x86_exception e; 4962 struct { 4963 u64 vpid; 4964 u64 gla; 4965 } operand; 4966 u16 vpid02; 4967 4968 if (!(vmx->nested.msrs.secondary_ctls_high & 4969 SECONDARY_EXEC_ENABLE_VPID) || 4970 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 4971 kvm_queue_exception(vcpu, UD_VECTOR); 4972 return 1; 4973 } 4974 4975 if (!nested_vmx_check_permission(vcpu)) 4976 return 1; 4977 4978 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4979 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 4980 4981 types = (vmx->nested.msrs.vpid_caps & 4982 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 4983 4984 if (type >= 32 || !(types & (1 << type))) 4985 return nested_vmx_failValid(vcpu, 4986 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 4987 4988 /* according to the intel vmx instruction reference, the memory 4989 * operand is read even if it isn't needed (e.g., for type==global) 4990 */ 4991 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 4992 vmx_instruction_info, false, sizeof(operand), &gva)) 4993 return 1; 4994 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { 4995 kvm_inject_page_fault(vcpu, &e); 4996 return 1; 4997 } 4998 if (operand.vpid >> 16) 4999 return nested_vmx_failValid(vcpu, 5000 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5001 5002 vpid02 = nested_get_vpid02(vcpu); 5003 switch (type) { 5004 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5005 if (!operand.vpid || 5006 is_noncanonical_address(operand.gla, vcpu)) 5007 return nested_vmx_failValid(vcpu, 5008 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5009 if (cpu_has_vmx_invvpid_individual_addr()) { 5010 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, 5011 vpid02, operand.gla); 5012 } else 5013 __vmx_flush_tlb(vcpu, vpid02, false); 5014 break; 5015 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5016 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5017 if (!operand.vpid) 5018 return nested_vmx_failValid(vcpu, 5019 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5020 __vmx_flush_tlb(vcpu, vpid02, false); 5021 break; 5022 case VMX_VPID_EXTENT_ALL_CONTEXT: 5023 __vmx_flush_tlb(vcpu, vpid02, false); 5024 break; 5025 default: 5026 WARN_ON_ONCE(1); 5027 return kvm_skip_emulated_instruction(vcpu); 5028 } 5029 5030 return nested_vmx_succeed(vcpu); 5031 } 5032 5033 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5034 struct vmcs12 *vmcs12) 5035 { 5036 u32 index = kvm_rcx_read(vcpu); 5037 u64 address; 5038 bool accessed_dirty; 5039 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 5040 5041 if (!nested_cpu_has_eptp_switching(vmcs12) || 5042 !nested_cpu_has_ept(vmcs12)) 5043 return 1; 5044 5045 if (index >= VMFUNC_EPTP_ENTRIES) 5046 return 1; 5047 5048 5049 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5050 &address, index * 8, 8)) 5051 return 1; 5052 5053 accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); 5054 5055 /* 5056 * If the (L2) guest does a vmfunc to the currently 5057 * active ept pointer, we don't have to do anything else 5058 */ 5059 if (vmcs12->ept_pointer != address) { 5060 if (!valid_ept_address(vcpu, address)) 5061 return 1; 5062 5063 kvm_mmu_unload(vcpu); 5064 mmu->ept_ad = accessed_dirty; 5065 mmu->mmu_role.base.ad_disabled = !accessed_dirty; 5066 vmcs12->ept_pointer = 

static int handle_vmfunc(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	u32 function = kvm_rax_read(vcpu);

	/*
	 * VMFUNC is only supported for nested guests, but we always enable the
	 * secondary control for simplicity; for non-nested mode, fake that we
	 * didn't enable it by injecting #UD.
	 */
	if (!is_guest_mode(vcpu)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	vmcs12 = get_vmcs12(vcpu);
	if ((vmcs12->vm_function_control & (1 << function)) == 0)
		goto fail;

	switch (function) {
	case 0:
		if (nested_vmx_eptp_switching(vcpu, vmcs12))
			goto fail;
		break;
	default:
		goto fail;
	}
	return kvm_skip_emulated_instruction(vcpu);

fail:
	nested_vmx_vmexit(vcpu, vmx->exit_reason,
			  vmcs_read32(VM_EXIT_INTR_INFO),
			  vmcs_readl(EXIT_QUALIFICATION));
	return 1;
}

static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification;
	gpa_t bitmap, last_bitmap;
	unsigned int port;
	int size;
	u8 b;

	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);

	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

	port = exit_qualification >> 16;
	size = (exit_qualification & 7) + 1;

	last_bitmap = (gpa_t)-1;
	b = -1;

	while (size > 0) {
		if (port < 0x8000)
			bitmap = vmcs12->io_bitmap_a;
		else if (port < 0x10000)
			bitmap = vmcs12->io_bitmap_b;
		else
			return true;
		bitmap += (port & 0x7fff) / 8;

		if (last_bitmap != bitmap)
			if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
				return true;
		if (b & (1 << (port & 7)))
			return true;

		port++;
		size--;
		last_bitmap = bitmap;
	}

	return false;
}
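/*
 * Illustrative sketch of the I/O-bitmap lookup performed above.  Each of the
 * two bitmaps covers 0x8000 ports, one bit per port: bitmap A covers ports
 * 0x0000-0x7fff and bitmap B covers 0x8000-0xffff.  For example, port 0x3f8
 * (COM1) lands in bitmap A at byte 0x3f8 / 8 = 127, bit 0x3f8 & 7 = 0.  The
 * helper below is hypothetical and only demonstrates the offset math.
 */
static inline void nested_io_bitmap_pos_example(u16 port, bool *use_bitmap_b,
						u16 *byte, u8 *bit)
{
	*use_bitmap_b = port >= 0x8000;	/* bitmap B covers the high half */
	*byte = (port & 0x7fff) / 8;	/* byte index within that bitmap */
	*bit = port & 7;		/* bit index within that byte */
}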

/*
 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
 * disinterest in the current event (read or write a specific MSR) by using an
 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
 */
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
					struct vmcs12 *vmcs12, u32 exit_reason)
{
	u32 msr_index = kvm_rcx_read(vcpu);
	gpa_t bitmap;

	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return true;

	/*
	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
	 * for the four combinations of read/write and low/high MSR numbers.
	 * First we need to figure out which of the four to use:
	 */
	bitmap = vmcs12->msr_bitmap;
	if (exit_reason == EXIT_REASON_MSR_WRITE)
		bitmap += 2048;
	if (msr_index >= 0xc0000000) {
		msr_index -= 0xc0000000;
		bitmap += 1024;
	}

	/* Then read the msr_index'th bit from this bitmap: */
	if (msr_index < 1024*8) {
		unsigned char b;
		if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
			return true;
		return 1 & (b >> (msr_index & 7));
	} else
		return true; /* let L1 handle the wrong parameter */
}
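/*
 * Illustrative sketch of the MSR-bitmap offset computed above.  The 4K page
 * is split into four 1K bitmaps: read-low, read-high, write-low, write-high,
 * where "high" means MSRs 0xc0000000-0xc0001fff.  For example, a write to
 * MSR 0xc0000080 (EFER) selects the write-high bitmap (byte offset 3072) and
 * then byte 0x80 / 8 = 16 within it, bit 0x80 & 7 = 0, i.e. byte 3088, bit 0
 * of the page.  The helper below is hypothetical and exists only to show the
 * arithmetic; like the code above, callers must still range-check the index.
 */
static inline u32 nested_msr_bitmap_byte_example(u32 msr, bool write)
{
	u32 offset = write ? 2048 : 0;	/* read vs. write halves of the page */

	if (msr >= 0xc0000000) {
		msr -= 0xc0000000;	/* high MSR range */
		offset += 1024;
	}
	return offset + msr / 8;	/* byte within the 4K page */
}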

/*
 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
 * intercept (via guest_host_mask etc.) the current event.
 */
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	int cr = exit_qualification & 15;
	int reg;
	unsigned long val;

	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
		reg = (exit_qualification >> 8) & 15;
		val = kvm_register_readl(vcpu, reg);
		switch (cr) {
		case 0:
			if (vmcs12->cr0_guest_host_mask &
			    (val ^ vmcs12->cr0_read_shadow))
				return true;
			break;
		case 3:
			if ((vmcs12->cr3_target_count >= 1 &&
			     vmcs12->cr3_target_value0 == val) ||
			    (vmcs12->cr3_target_count >= 2 &&
			     vmcs12->cr3_target_value1 == val) ||
			    (vmcs12->cr3_target_count >= 3 &&
			     vmcs12->cr3_target_value2 == val) ||
			    (vmcs12->cr3_target_count >= 4 &&
			     vmcs12->cr3_target_value3 == val))
				return false;
			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
				return true;
			break;
		case 4:
			if (vmcs12->cr4_guest_host_mask &
			    (vmcs12->cr4_read_shadow ^ val))
				return true;
			break;
		case 8:
			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
				return true;
			break;
		}
		break;
	case 2: /* clts */
		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
			return true;
		break;
	case 1: /* mov from cr */
		switch (cr) {
		case 3:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR3_STORE_EXITING)
				return true;
			break;
		case 8:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR8_STORE_EXITING)
				return true;
			break;
		}
		break;
	case 3: /* lmsw */
		/*
		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
		 * cr0. Other attempted changes are ignored, with no exit.
		 */
		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
		if (vmcs12->cr0_guest_host_mask & 0xe &
		    (val ^ vmcs12->cr0_read_shadow))
			return true;
		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
		    !(vmcs12->cr0_read_shadow & 0x1) &&
		    (val & 0x1))
			return true;
		break;
	}
	return false;
}
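/*
 * Illustrative sketch of the CR0 ownership test used above.  A bit set in
 * cr0_guest_host_mask means L1 owns that CR0 bit, and a guest write must be
 * reflected to L1 whenever an owned bit would differ from the read shadow.
 * For LMSW only the low four bits (PE, MP, EM, TS) are in play, and PE can
 * only be set, never cleared, which is why the code above masks with 0xe and
 * special-cases bit 0.  The helper name is hypothetical.
 */
static inline bool nested_cr0_write_intercepted_example(unsigned long mask,
							unsigned long shadow,
							unsigned long val)
{
	/* Does the write change any bit that L1 asked to own? */
	return (mask & (val ^ shadow)) != 0;
}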

static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
	struct vmcs12 *vmcs12, gpa_t bitmap)
{
	u32 vmx_instruction_info;
	unsigned long field;
	u8 b;

	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return true;

	/* Decode instruction info and find the field to access */
	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));

	/* Out-of-range fields always cause a VM exit from L2 to L1 */
	if (field >> 15)
		return true;

	if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
		return true;

	return 1 & (b >> (field & 7));
}
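/*
 * Illustrative sketch of the VMREAD/VMWRITE bitmap lookup above.  Each bitmap
 * is one guest page indexed by the low 15 bits of the VMCS field encoding,
 * one bit per encoding.  For example, field encoding 0x4402 (the exit-reason
 * field, per the SDM) maps to byte 0x4402 / 8 = 0x880, bit 0x4402 & 7 = 2.
 * The helper below is hypothetical and only restates that arithmetic.
 */
static inline void nested_vmcs_bitmap_pos_example(unsigned long field,
						  unsigned long *byte, u8 *bit)
{
	*byte = field / 8;	/* byte offset within the bitmap page */
	*bit = field & 7;	/* bit within that byte */
}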

/*
 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
 * should handle it ourselves in L0 (and then continue L2). Only call this
 * when in is_guest_mode (L2).
 */
bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
{
	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (vmx->nested.nested_run_pending)
		return false;

	if (unlikely(vmx->fail)) {
		trace_kvm_nested_vmenter_failed(
			"hardware VM-instruction error: ",
			vmcs_read32(VM_INSTRUCTION_ERROR));
		return true;
	}

	/*
	 * The host physical addresses of some pages of guest memory
	 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
	 * Page). The CPU may write to these pages via their host
	 * physical address while L2 is running, bypassing any
	 * address-translation-based dirty tracking (e.g. EPT write
	 * protection).
	 *
	 * Mark them dirty on every exit from L2 to prevent them from
	 * getting out of sync with dirty tracking.
	 */
	nested_mark_vmcs12_pages_dirty(vcpu);

	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
				vmcs_readl(EXIT_QUALIFICATION),
				vmx->idt_vectoring_info,
				intr_info,
				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
				KVM_ISA_VMX);

	switch (exit_reason) {
	case EXIT_REASON_EXCEPTION_NMI:
		if (is_nmi(intr_info))
			return false;
		else if (is_page_fault(intr_info))
			return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
		else if (is_debug(intr_info) &&
			 vcpu->guest_debug &
			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
			return false;
		else if (is_breakpoint(intr_info) &&
			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			return false;
		return vmcs12->exception_bitmap &
				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return false;
	case EXIT_REASON_TRIPLE_FAULT:
		return true;
	case EXIT_REASON_PENDING_INTERRUPT:
		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
	case EXIT_REASON_NMI_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
	case EXIT_REASON_TASK_SWITCH:
		return true;
	case EXIT_REASON_CPUID:
		return true;
	case EXIT_REASON_HLT:
		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
	case EXIT_REASON_INVD:
		return true;
	case EXIT_REASON_INVLPG:
		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_RDPMC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
	case EXIT_REASON_RDRAND:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
	case EXIT_REASON_RDSEED:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
	case EXIT_REASON_VMREAD:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmread_bitmap);
	case EXIT_REASON_VMWRITE:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmwrite_bitmap);
	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
		/*
		 * VMX instructions trap unconditionally. This allows L1 to
		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
		 */
		return true;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		return nested_vmx_exit_handled_io(vcpu, vmcs12);
	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return true;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_TRAP_FLAG:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
			nested_cpu_has2(vmcs12,
				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return false;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
	case EXIT_REASON_APIC_ACCESS:
	case EXIT_REASON_APIC_WRITE:
	case EXIT_REASON_EOI_INDUCED:
		/*
		 * The controls for "virtualize APIC accesses," "APIC-
		 * register virtualization," and "virtual-interrupt
		 * delivery" only come from vmcs12.
		 */
		return true;
	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * L0 always deals with the EPT violation. If nested EPT is
		 * used, and the nested mmu code discovers that the address is
		 * missing in the guest EPT table (EPT12), the EPT violation
		 * will be injected with nested_ept_inject_page_fault()
		 */
		return false;
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * L2 never uses L1's EPT directly, but rather L0's own EPT
		 * table (shadow on EPT) or a merged EPT table that L0 built
		 * (EPT on EPT). So any problems with the structure of the
		 * table are L0's fault.
		 */
		return false;
	case EXIT_REASON_INVPCID:
		return
			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return true;
	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
		/*
		 * This should never happen, since it is not possible to
		 * set XSS to a non-zero value---neither in L1 nor in L2.
		 * If it were, XSS would have to be checked against
		 * the XSS exit bitmap in vmcs12.
		 */
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
	case EXIT_REASON_PREEMPTION_TIMER:
		return false;
	case EXIT_REASON_PML_FULL:
		/* We emulate PML support to L1. */
		return false;
	case EXIT_REASON_VMFUNC:
		/* VM functions are emulated through L2->L0 vmexits. */
		return false;
	case EXIT_REASON_ENCLS:
		/* SGX is never exposed to L1 */
		return false;
	case EXIT_REASON_UMWAIT:
	case EXIT_REASON_TPAUSE:
		return nested_cpu_has2(vmcs12,
			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
	default:
		return true;
	}
}
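/*
 * Illustrative sketch of the exception-bitmap test in the
 * EXIT_REASON_EXCEPTION_NMI case above: the exception vector sits in the low
 * bits of the interrupt-info field, and the exit is reflected to L1 iff the
 * corresponding bit of vmcs12->exception_bitmap is set.  For example, a #PF
 * is vector 14, so it is governed by bit 14.  The helper name is
 * hypothetical; "vector" is an exception vector in the range 0-31.
 */
static inline bool nested_exception_reflected_example(u32 exception_bitmap,
						      u8 vector)
{
	return exception_bitmap & (1u << vector);
}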

static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_VMX,
		.size = sizeof(kvm_state),
		.hdr.vmx.vmxon_pa = -1ull,
		.hdr.vmx.vmcs12_pa = -1ull,
	};
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];

	if (!vcpu)
		return kvm_state.size + sizeof(*user_vmx_nested_state);

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (nested_vmx_allowed(vcpu) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;

		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);

			if (vmx->nested.hv_evmcs)
				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != -1ull)
				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
		}
	}

	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02. When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
	} else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
		if (vmx->nested.hv_evmcs)
			copy_enlightened_to_vmcs12(vmx);
		else if (enable_shadow_vmcs)
			copy_shadow_to_vmcs12(vmx);
	}

	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);

	/*
	 * Copy over the full allocated size of vmcs12 rather than just the size
	 * of the struct.
	 */
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != -1ull) {
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
			return -EFAULT;
	}

out:
	return kvm_state.size;
}
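/*
 * Illustrative sketch of the buffer-size math above: the nested-state blob is
 * a fixed struct kvm_nested_state header, plus one vmcs12-sized region for
 * the cached vmcs12, plus a second such region only when L2 is running with
 * an active shadow vmcs12.  The helper is hypothetical and simply mirrors how
 * kvm_state.size is accumulated above.
 */
static inline size_t nested_state_size_example(bool has_vmcs12,
					       bool has_shadow_vmcs12)
{
	size_t size = sizeof(struct kvm_nested_state);

	if (has_vmcs12)
		size += VMCS12_SIZE;	/* data.vmx[0].vmcs12 */
	if (has_shadow_vmcs12)
		size += VMCS12_SIZE;	/* data.vmx[0].shadow_vmcs12 */
	return size;
}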

/*
 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
 */
void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(vcpu);
}

static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	u32 exit_qual;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable eVMCS capability on vCPU. However, since then
		 * code was changed such that flag signals vmcs12 should
		 * be copied into eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow user
		 * to set this flag even when there is no VMXON region.
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!nested_vmx_allowed(vcpu))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
		(kvm_state->flags &
		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
	    (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
		return -EINVAL;

	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
		return 0;

	if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * Sync eVMCS upon entry as we may not have
		 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
		 */
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != -1ull) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

void nested_vmx_vcpu_setup(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control MSRs has a low and high 32-bit half: a low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
				bool apicv)
{
	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
	 * fields of vmcs01 and vmcs02, will turn these bits off - and
	 * nested_vmx_exit_reflected() will not pass related exits to L1.
	 * These rules have exceptions below.
	 */

	/* pin-based controls */
	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
	      msrs->pinbased_ctls_low,
	      msrs->pinbased_ctls_high);
	msrs->pinbased_ctls_low |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;

	/* exit controls */
	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
	      msrs->exit_ctls_low,
	      msrs->exit_ctls_high);
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;

	/* entry controls */
	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
	      msrs->entry_ctls_low,
	      msrs->entry_ctls_high);
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;

	/* cpu-based controls */
	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
	      msrs->procbased_ctls_low,
	      msrs->procbased_ctls_high);
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->procbased_ctls_high &=
		CPU_BASED_VIRTUAL_INTR_PENDING |
		CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);

	/*
	 * secondary cpu-based controls. Do not include those that
	 * depend on CPUID bits, they are added later by vmx_cpuid_update.
	 */
	if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
		      msrs->secondary_ctls_low,
		      msrs->secondary_ctls_high);

	msrs->secondary_ctls_low = 0;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_XSAVES;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
		if (cpu_has_vmx_ept_execute_only())
			msrs->ept_caps |=
				VMX_EPT_EXECUTE_ONLY_BIT;
		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}
	}

	if (cpu_has_vmx_vmfunc()) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VMFUNC;
		/*
		 * Advertise EPTP switching unconditionally
		 * since we emulate it
		 */
		if (enable_ept)
			msrs->vmfunc_controls =
				VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context. The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	/* miscellaneous data */
	rdmsr(MSR_IA32_VMX_MISC,
	      msrs->misc_low,
	      msrs->misc_high);
	msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT;
	msrs->misc_high = 0;

	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic =
		VMCS12_REVISION |
		VMX_BASIC_TRUE_CTLS |
		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;

	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	/* highest index: VMX_PREEMPTION_TIMER_VALUE */
	msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
}
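/*
 * Illustrative sketches of the two validity rules implied by the MSRs set up
 * above; both helpers are hypothetical and exist only to spell the checks
 * out.  Per the comment on nested_vmx_setup_ctls_msrs(), a 32-bit control
 * field is acceptable against a low/high pair when every must-be-1 (low) bit
 * is set and no bit outside the may-be-1 (high) mask is set, which is what
 * vmx_control_verify() is used for.  Likewise a CR0/CR4 value is acceptable
 * when every bit set in the FIXED0 MSR is set and every bit clear in the
 * FIXED1 MSR is clear.
 */
static inline bool vmx_control_ok_example(u32 control, u32 low, u32 high)
{
	return (control & low) == low && (control & ~high) == 0;
}

static inline bool vmx_cr_fixed_ok_example(u64 cr, u64 fixed0, u64 fixed1)
{
	return (cr & fixed0) == fixed0 && (cr & ~fixed1) == 0;
}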

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear,
	exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
	exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld,
	exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst,
	exit_handlers[EXIT_REASON_VMREAD] = handle_vmread,
	exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume,
	exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite,
	exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff,
	exit_handlers[EXIT_REASON_VMON] = handle_vmon,
	exit_handlers[EXIT_REASON_INVEPT] = handle_invept,
	exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid,
	exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc,

	kvm_x86_ops->check_nested_events = vmx_check_nested_events;
	kvm_x86_ops->get_nested_state = vmx_get_nested_state;
	kvm_x86_ops->set_nested_state = vmx_set_nested_state;
	kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages,
	kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
	kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;

	return 0;
}