// SPDX-License-Identifier: GPL-2.0

#include <linux/frame.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "trace.h"
#include "x86.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
		return nested_vmx_failInvalid(vcpu);

	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force a shadow sync because
	 * VM_INSTRUCTION_ERROR is not shadowed
	 */
	return kvm_skip_emulated_instruction(vcpu);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: don't simply reset the guest here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}

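/*
 * Stop using the shadow VMCS: clear the SHADOW_VMCS secondary exec control,
 * reset the VMCS link pointer to the architectural "no VMCS" value (-1ull)
 * and drop any pending vmcs12-to-shadow sync.
 */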
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, -1ull);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.hv_evmcs)
		return;

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
	vmx->nested.hv_evmcs_vmptr = -1ull;
	vmx->nested.hv_evmcs = NULL;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	kfree(vmx->nested.cached_shadow_vmcs12);
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_dirty(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (vmx->loaded_vmcs == vmcs)
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vmx_segment_cache_clear(vmx);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
	free_nested(vcpu);
	vcpu_put(vcpu);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else if (fault->error_code & PFERR_RSVD_MASK)
		exit_reason = EXIT_REASON_EPT_MISCONFIG;
	else
		exit_reason = EXIT_REASON_EPT_VIOLATION;

	nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	kvm_init_shadow_ept_mmu(vcpu,
				to_vmx(vcpu)->nested.msrs.ept_caps &
				VMX_EPT_EXECUTE_ONLY_BIT,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_cr3(vcpu));
	vcpu->arch.mmu->set_cr3 = vmx_set_cr3;
	vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}

/*
 * KVM received a page fault that it wants to inject into the guest. This
 * function checks whether, for a nested guest, the fault should be injected
 * into L1 or L2.
 */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (nr == PF_VECTOR) {
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
			return 1;
		}
	} else if (vmcs12->exception_bitmap & (1u << nr)) {
		if (nr == DB_VECTOR) {
			if (!has_payload) {
				payload = vcpu->arch.dr6;
				payload &= ~(DR6_FIXED_1 | DR6_BT);
				payload ^= DR6_RTM;
			}
			*exit_qual = payload;
		} else
			*exit_qual = 0;
		return 1;
	}

	return 0;
}

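/*
 * Inject a page fault while L2 is active: reflect it to L1 as an
 * EXCEPTION_NMI vmexit if vmcs12 is configured to intercept #PF with this
 * error code and no nested VM-Enter is pending, otherwise deliver the fault
 * directly to L2.
 */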
static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
	    !to_vmx(vcpu)->nested.nested_run_pending) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}

static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
{
	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
	    !page_address_valid(vcpu, vmcs12->io_bitmap_b))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
		return -EINVAL;

	return 0;
}

/*
 * Check if a write to the given MSR is intercepted by the L01 (vmcs01)
 * MSR bitmap.
 */
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;

	if (msr <= 0x1fff) {
		return !!test_bit(msr, msr_bitmap + 0x800 / f);
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
	}

	return true;
}

/*
 * If a msr is allowed by L0, we should check whether it is allowed by L1.
 * The corresponding bit will be cleared unless both L0 and L1 allow it.
 */
static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
						 unsigned long *msr_bitmap_nested,
						 u32 msr, int type)
{
	int f = sizeof(unsigned long);

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R &&
		    !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
			/* read-low */
			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

		if (type & MSR_TYPE_W &&
		    !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
			/* write-low */
			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R &&
		    !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
			/* read-high */
			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);

		if (type & MSR_TYPE_W &&
		    !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
			/* write-high */
			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

	}
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively disable them below.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);

	/*
	 * Checking the L0->L1 bitmap is trying to verify two things:
	 *
	 * 1. L0 gave permission to L1 to actually passthrough the MSR. This
	 *    ensures that we do not accidentally generate an L02 MSR bitmap
	 *    from the L12 MSR bitmap that is too permissive.
	 * 2. That L1 or L2s have actually used the MSR. This avoids
	 *    unnecessary merging of the bitmap if the MSR is unused. This
	 *    works properly because we only update the L01 MSR bitmap lazily.
	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
	 *    updated to reflect this when L1 (or its L2s) actually write to
	 *    the MSR.
	 */
	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_SPEC_CTRL,
					MSR_TYPE_R | MSR_TYPE_W);

	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_PRED_CMD,
					MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct kvm_host_map map;
	struct vmcs12 *shadow;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	shadow = get_shadow_vmcs12(vcpu);

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
		return;

	memcpy(shadow, map.hva, VMCS12_SIZE);
	kvm_vcpu_unmap(vcpu, &map, false);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
	return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    !page_address_valid(vcpu, vmcs12->apic_access_addr))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (nested_cpu_has_vid(vmcs12) &&
	    !nested_exit_on_intr(vcpu))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    (!nested_cpu_has_vid(vmcs12) ||
	     !nested_exit_intr_ack_set(vcpu) ||
	     (vmcs12->posted_intr_nv & 0xff00) ||
	     (vmcs12->posted_intr_desc_addr & 0x3f) ||
	     (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	int maxphyaddr;

	if (count == 0)
		return 0;
	maxphyaddr = cpuid_maxphyaddr(vcpu);
	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count,
					vmcs12->vm_exit_msr_load_addr) ||
	    nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count,
					vmcs12->vm_exit_msr_store_addr))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count,
					vmcs12->vm_entry_msr_load_addr))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (!nested_cpu_has_ept(vmcs12) ||
	    !page_address_valid(vcpu, vmcs12->pml_address))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	    !nested_cpu_has_ept(vmcs12))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	    !nested_cpu_has_ept(vmcs12))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
	    !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
		return -EINVAL;
	if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
	    e->index == MSR_IA32_UCODE_REV)
		return -EINVAL;
	if (e->reserved != 0)
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (e->index == MSR_FS_BASE ||
	    e->index == MSR_GS_BASE ||
	    e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * Return 0 on success, the (1-based) index of the failing entry on failure.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	struct msr_data msr;

	msr.host_initiated = false;
	for (i = 0; i < count; i++) {
		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		msr.index = e.index;
		msr.data = e.value;
		if (kvm_set_msr(vcpu, &msr)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	return i + 1;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;

	for (i = 0; i < count; i++) {
		struct msr_data msr_info;
		if (kvm_vcpu_read_guest(vcpu,
					gpa + i * sizeof(e),
					&e, 2 * sizeof(u32))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			return -EINVAL;
		}
		if (nested_vmx_store_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			return -EINVAL;
		}
		msr_info.host_initiated = false;
		msr_info.index = e.index;
		if (kvm_get_msr(vcpu, &msr_info)) {
			pr_debug_ratelimited(
				"%s cannot read MSR (%u, 0x%x)\n",
				__func__, i, e.index);
			return -EINVAL;
		}
		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
						offsetof(struct vmx_msr_entry, value),
					 &msr_info.data, sizeof(msr_info.data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, msr_info.data);
			return -EINVAL;
		}
	}
	return 0;
}

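/*
 * Reject CR3 values that set bits above the guest's MAXPHYADDR; such a
 * value can never reference a valid physical page for the guest.
 */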
static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
	unsigned long invalid_mask;

	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
	return (val & invalid_mask) == 0;
}

/*
 * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
 * emulating VM entry into a guest with EPT enabled.
 * Returns 0 on success, -EINVAL on failure. The invalid-state exit
 * qualification code is assigned to *entry_failure_code on failure.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
			       u32 *entry_failure_code)
{
	if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
		if (!nested_cr3_valid(vcpu, cr3)) {
			*entry_failure_code = ENTRY_FAIL_DEFAULT;
			return -EINVAL;
		}

		/*
		 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
		 * must not be dereferenced.
		 */
		if (is_pae_paging(vcpu) && !nested_ept) {
			if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
				*entry_failure_code = ENTRY_FAIL_PDPTE;
				return -EINVAL;
			}
		}
	}

	if (!nested_ept)
		kvm_mmu_new_cr3(vcpu, cr3, false);

	vcpu->arch.cr3 = cr3;
	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);

	kvm_init_mmu(vcpu, false);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L1 uses EPT, then TLB entries are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return nested_cpu_has_ept(vmcs12) ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

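/*
 * Returns true if, within the bits selected by @mask, every bit set in
 * @subset is also set in @superset. For example, with mask == -1ULL,
 * superset == 0b1011 and subset == 0b0011 passes, while subset == 0b0100
 * does not.
 */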
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmx->nested.msrs.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 supported;
	u32 *lowp, *highp;

	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		lowp = &vmx->nested.msrs.pinbased_ctls_low;
		highp = &vmx->nested.msrs.pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		lowp = &vmx->nested.msrs.procbased_ctls_low;
		highp = &vmx->nested.msrs.procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		lowp = &vmx->nested.msrs.exit_ctls_low;
		highp = &vmx->nested.msrs.exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		lowp = &vmx->nested.msrs.entry_ctls_low;
		highp = &vmx->nested.msrs.entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		lowp = &vmx->nested.msrs.secondary_ctls_low;
		highp = &vmx->nested.msrs.secondary_ctls_high;
		break;
	default:
		BUG();
	}

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc;

	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				   vmx->nested.msrs.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap;

	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
					   vmx->nested.msrs.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u64 *msr;

	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		msr = &vmx->nested.msrs.cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		msr = &vmx->nested.msrs.cr4_fixed0;
		break;
	default:
		BUG();
	}

	/*
	 * 1 bits (which indicate bits which "must-be-1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*msr = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmx->nested.msrs.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest. Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

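/*
 * Propagate the cached vmcs12 fields that are tracked in the shadow field
 * tables into the shadow VMCS, so that L1's VMREADs of those fields are
 * satisfied by the CPU without a VM-exit.
 */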
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
	}

	/*
	 * Not used?
	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
	 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
	 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
	 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
	 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
	 * vmcs12->page_fault_error_code_mask =
	 *		evmcs->page_fault_error_code_mask;
	 * vmcs12->page_fault_error_code_match =
	 *		evmcs->page_fault_error_code_match;
	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
	 */

	/*
	 * Read only fields:
	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
	 * vmcs12->exit_qualification = evmcs->exit_qualification;
	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
	 *
	 * Not present in struct vmcs12:
	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return 0;
}

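/*
 * Copy the fields that sync_vmcs02_to_vmcs12() may have updated from the
 * cached vmcs12 back into the enlightened VMCS mapped from L1's memory, so
 * that a Hyper-V style L1 sees up-to-date guest state and exit information.
 */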
static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/*
	 * Should not be changed by KVM:
	 *
	 * evmcs->host_es_selector = vmcs12->host_es_selector;
	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
	 * evmcs->host_cr0 = vmcs12->host_cr0;
	 * evmcs->host_cr3 = vmcs12->host_cr3;
	 * evmcs->host_cr4 = vmcs12->host_cr4;
	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
	 * evmcs->host_rip = vmcs12->host_rip;
	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
	 * evmcs->host_fs_base = vmcs12->host_fs_base;
	 * evmcs->host_gs_base = vmcs12->host_gs_base;
	 * evmcs->host_tr_base = vmcs12->host_tr_base;
	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
	 * evmcs->host_rsp = vmcs12->host_rsp;
	 *
	 * sync_vmcs02_to_vmcs12() doesn't read these:
	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
	 * evmcs->ept_pointer = vmcs12->ept_pointer;
	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
	 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
	 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
	 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
	 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
	 * evmcs->page_fault_error_code_mask =
	 *		vmcs12->page_fault_error_code_mask;
	 * evmcs->page_fault_error_code_match =
	 *		vmcs12->page_fault_error_code_match;
	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
	 * evmcs->tsc_offset = vmcs12->tsc_offset;
	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
	 *
	 * Not present in struct vmcs12:
	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
	 */

	evmcs->guest_es_selector = vmcs12->guest_es_selector;
	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;

	evmcs->guest_es_limit = vmcs12->guest_es_limit;
	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;

	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
	evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
	evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
	evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
	evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
	evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
	evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;

	evmcs->guest_es_base = vmcs12->guest_es_base;
	evmcs->guest_cs_base = vmcs12->guest_cs_base;
	evmcs->guest_ss_base = vmcs12->guest_ss_base;
	evmcs->guest_ds_base = vmcs12->guest_ds_base;
	evmcs->guest_fs_base = vmcs12->guest_fs_base;
	evmcs->guest_gs_base = vmcs12->guest_gs_base;
	evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
	evmcs->guest_tr_base = vmcs12->guest_tr_base;
	evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
	evmcs->guest_idtr_base = vmcs12->guest_idtr_base;

	evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
	evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;

	evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
	evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
	evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
	evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;

	evmcs->guest_pending_dbg_exceptions =
		vmcs12->guest_pending_dbg_exceptions;
	evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
	evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;

	evmcs->guest_activity_state = vmcs12->guest_activity_state;
	evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;

	evmcs->guest_cr0 = vmcs12->guest_cr0;
	evmcs->guest_cr3 = vmcs12->guest_cr3;
	evmcs->guest_cr4 = vmcs12->guest_cr4;
	evmcs->guest_dr7 = vmcs12->guest_dr7;

	evmcs->guest_physical_address = vmcs12->guest_physical_address;

	evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
	evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
	evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
	evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
	evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
	evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
	evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
	evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;

	evmcs->exit_qualification = vmcs12->exit_qualification;

	evmcs->guest_linear_address = vmcs12->guest_linear_address;
	evmcs->guest_rsp = vmcs12->guest_rsp;
	evmcs->guest_rflags = vmcs12->guest_rflags;

	evmcs->guest_interruptibility_info =
		vmcs12->guest_interruptibility_info;
	evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
	evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
	evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
	evmcs->vm_entry_exception_error_code =
		vmcs12->vm_entry_exception_error_code;
	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;

	evmcs->guest_rip = vmcs12->guest_rip;

	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;

	return 0;
}

/*
 * This is an equivalent of the nested hypervisor executing the vmptrld
 * instruction.
 */
 */
static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
						 bool from_launch)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool evmcs_gpa_changed = false;
	u64 evmcs_gpa;

	if (likely(!vmx->nested.enlightened_vmcs_enabled))
		return 1;

	if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
		return 1;

	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
		if (!vmx->nested.hv_evmcs)
			vmx->nested.current_vmptr = -1ull;

		nested_release_evmcs(vcpu);

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
				 &vmx->nested.hv_evmcs_map))
			return 0;

		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

		/*
		 * Currently, KVM only supports eVMCS version 1
		 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set
		 * this value in the first u32 field of the eVMCS, which
		 * specifies the eVMCS VersionNumber.
		 *
		 * The guest should learn the supported eVMCS versions from
		 * the host by examining CPUID.0x4000000A.EAX[0:15]. The host
		 * userspace VMM is expected to set this CPUID leaf according
		 * to the value returned in vmcs_version from
		 * nested_enable_evmcs().
		 *
		 * However, it turns out that Microsoft Hyper-V fails to
		 * comply with its own invented interface: when Hyper-V uses
		 * eVMCS, it simply sets the first u32 field of the eVMCS to
		 * the revision_id specified in MSR_IA32_VMX_BASIC, instead of
		 * the eVMCS version number, which should be one of the
		 * supported versions specified in CPUID.0x4000000A.EAX[0:15].
		 *
		 * To overcome this Hyper-V bug, accept here either a
		 * supported eVMCS version or the VMCS12 revision_id as valid
		 * values for the first u32 field of the eVMCS.
		 */
		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
			nested_release_evmcs(vcpu);
			return 0;
		}

		vmx->nested.dirty_vmcs12 = true;
		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

		evmcs_gpa_changed = true;
		/*
		 * Unlike normal vmcs12, enlightened vmcs12 is not fully
		 * reloaded from guest's memory (read only fields, fields not
		 * present in struct hv_enlightened_vmcs, ...). Make sure there
		 * are no leftovers.
		 */
		if (from_launch) {
			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
			memset(vmcs12, 0, sizeof(*vmcs12));
			vmcs12->hdr.revision_id = VMCS12_REVISION;
		}

	}

	/*
	 * Clean fields data can't be used on VMLAUNCH and when we switch
	 * between different L2 guests as KVM keeps a single VMCS12 per L1.
	 */
	if (from_launch || evmcs_gpa_changed)
		vmx->nested.hv_evmcs->hv_clean_fields &=
			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return 1;
}

void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * hv_evmcs may end up being not mapped after migration (when
	 * L2 was running), map it here to make sure vmcs12 changes are
	 * properly reflected.
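	 *
	 * As a side note on the Hyper-V quirk handled in
	 * nested_vmx_handle_enlightened_vmptrld() above: the acceptance test
	 * for the first u32 of the eVMCS boils down to "either a supported
	 * eVMCS version or the vmcs12 revision id".  A minimal standalone
	 * sketch of that check (constants and names below are illustrative
	 * placeholders, not the KVM symbols):
	 *
	 *	#include <stdbool.h>
	 *	#include <stdint.h>
	 *
	 *	#define SKETCH_EVMCS_VERSION	1          // like KVM_EVMCS_VERSION
	 *	#define SKETCH_VMCS12_REV	0x11e57ed0 // placeholder revision id
	 *
	 *	static bool evmcs_first_u32_ok(uint32_t first_u32)
	 *	{
	 *		// Spec-compliant guests write the eVMCS version number;
	 *		// Hyper-V writes the VMCS revision id instead, so both
	 *		// values have to be tolerated.
	 *		return first_u32 == SKETCH_EVMCS_VERSION ||
	 *		       first_u32 == SKETCH_VMCS12_REV;
	 *	}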
1879 */ 1880 if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) 1881 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 1882 1883 if (vmx->nested.hv_evmcs) { 1884 copy_vmcs12_to_enlightened(vmx); 1885 /* All fields are clean */ 1886 vmx->nested.hv_evmcs->hv_clean_fields |= 1887 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 1888 } else { 1889 copy_vmcs12_to_shadow(vmx); 1890 } 1891 1892 vmx->nested.need_vmcs12_to_shadow_sync = false; 1893 } 1894 1895 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 1896 { 1897 struct vcpu_vmx *vmx = 1898 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 1899 1900 vmx->nested.preemption_timer_expired = true; 1901 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 1902 kvm_vcpu_kick(&vmx->vcpu); 1903 1904 return HRTIMER_NORESTART; 1905 } 1906 1907 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) 1908 { 1909 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; 1910 struct vcpu_vmx *vmx = to_vmx(vcpu); 1911 1912 /* 1913 * A timer value of zero is architecturally guaranteed to cause 1914 * a VMExit prior to executing any instructions in the guest. 1915 */ 1916 if (preemption_timeout == 0) { 1917 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 1918 return; 1919 } 1920 1921 if (vcpu->arch.virtual_tsc_khz == 0) 1922 return; 1923 1924 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 1925 preemption_timeout *= 1000000; 1926 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 1927 hrtimer_start(&vmx->nested.preemption_timer, 1928 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); 1929 } 1930 1931 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 1932 { 1933 if (vmx->nested.nested_run_pending && 1934 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 1935 return vmcs12->guest_ia32_efer; 1936 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 1937 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 1938 else 1939 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 1940 } 1941 1942 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 1943 { 1944 /* 1945 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 1946 * according to L0's settings (vmcs12 is irrelevant here). Host 1947 * fields that come from L0 and are not constant, e.g. HOST_CR3, 1948 * will be set as needed prior to VMLAUNCH/VMRESUME. 1949 */ 1950 if (vmx->nested.vmcs02_initialized) 1951 return; 1952 vmx->nested.vmcs02_initialized = true; 1953 1954 /* 1955 * We don't care what the EPTP value is we just need to guarantee 1956 * it's valid so we don't get a false positive when doing early 1957 * consistency checks. 1958 */ 1959 if (enable_ept && nested_early_check) 1960 vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); 1961 1962 /* All VMFUNCs are currently emulated through L0 vmexits. */ 1963 if (cpu_has_vmx_vmfunc()) 1964 vmcs_write64(VM_FUNCTION_CONTROL, 0); 1965 1966 if (cpu_has_vmx_posted_intr()) 1967 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 1968 1969 if (cpu_has_vmx_msr_bitmap()) 1970 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 1971 1972 /* 1973 * The PML address never changes, so it is constant in vmcs02. 1974 * Conceptually we want to copy the PML index from vmcs01 here, 1975 * and then back to vmcs01 on nested vmexit. But since we flush 1976 * the log and reset GUEST_PML_INDEX on each vmexit, the PML 1977 * index is also effectively constant in vmcs02. 
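	 *
	 * (Unrelated to PML itself: the emulated preemption timer armed in
	 * vmx_start_preemption_timer() above converts guest timer ticks into
	 * an hrtimer delay, assuming the emulated rate of 2^5 ticks per TSC
	 * cycle step, cf. VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE.  A tiny
	 * standalone illustration, with an invented helper name and sample
	 * numbers:
	 *
	 *	#include <stdint.h>
	 *
	 *	static uint64_t preempt_ticks_to_ns(uint64_t ticks, uint64_t tsc_khz)
	 *	{
	 *		return (ticks << 5) * 1000000ull / tsc_khz;
	 *	}
	 *
	 * Example: ticks = 65536 on a guest with tsc_khz = 2000000 (2 GHz)
	 * gives 65536 * 32 * 1e6 / 2e6 = 1048576 ns, i.e. roughly 1.05 ms.)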
1978 */ 1979 if (enable_pml) { 1980 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 1981 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 1982 } 1983 1984 if (cpu_has_vmx_encls_vmexit()) 1985 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); 1986 1987 /* 1988 * Set the MSR load/store lists to match L0's settings. Only the 1989 * addresses are constant (for vmcs02), the counts can change based 1990 * on L2's behavior, e.g. switching to/from long mode. 1991 */ 1992 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 1993 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 1994 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 1995 1996 vmx_set_constant_host_state(vmx); 1997 } 1998 1999 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2000 struct vmcs12 *vmcs12) 2001 { 2002 prepare_vmcs02_constant_state(vmx); 2003 2004 vmcs_write64(VMCS_LINK_POINTER, -1ull); 2005 2006 if (enable_vpid) { 2007 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2008 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2009 else 2010 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2011 } 2012 } 2013 2014 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2015 { 2016 u32 exec_control, vmcs12_exec_ctrl; 2017 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2018 2019 if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) 2020 prepare_vmcs02_early_rare(vmx, vmcs12); 2021 2022 /* 2023 * PIN CONTROLS 2024 */ 2025 exec_control = vmx_pin_based_exec_ctrl(vmx); 2026 exec_control |= (vmcs12->pin_based_vm_exec_control & 2027 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2028 2029 /* Posted interrupts setting is only taken from vmcs12. */ 2030 if (nested_cpu_has_posted_intr(vmcs12)) { 2031 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2032 vmx->nested.pi_pending = false; 2033 } else { 2034 exec_control &= ~PIN_BASED_POSTED_INTR; 2035 } 2036 pin_controls_set(vmx, exec_control); 2037 2038 /* 2039 * EXEC CONTROLS 2040 */ 2041 exec_control = vmx_exec_control(vmx); /* L0's desires */ 2042 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 2043 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 2044 exec_control &= ~CPU_BASED_TPR_SHADOW; 2045 exec_control |= vmcs12->cpu_based_vm_exec_control; 2046 2047 if (exec_control & CPU_BASED_TPR_SHADOW) 2048 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2049 #ifdef CONFIG_X86_64 2050 else 2051 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2052 CPU_BASED_CR8_STORE_EXITING; 2053 #endif 2054 2055 /* 2056 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2057 * for I/O port accesses. 2058 */ 2059 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2060 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2061 2062 /* 2063 * This bit will be computed in nested_get_vmcs12_pages, because 2064 * we do not have access to L1's MSR bitmap yet. For now, keep 2065 * the same bit as before, hoping to avoid multiple VMWRITEs that 2066 * only set/clear this bit. 
2067 */ 2068 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2069 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2070 2071 exec_controls_set(vmx, exec_control); 2072 2073 /* 2074 * SECONDARY EXEC CONTROLS 2075 */ 2076 if (cpu_has_secondary_exec_ctrls()) { 2077 exec_control = vmx->secondary_exec_control; 2078 2079 /* Take the following fields only from vmcs12 */ 2080 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2081 SECONDARY_EXEC_ENABLE_INVPCID | 2082 SECONDARY_EXEC_RDTSCP | 2083 SECONDARY_EXEC_XSAVES | 2084 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2085 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2086 SECONDARY_EXEC_ENABLE_VMFUNC); 2087 if (nested_cpu_has(vmcs12, 2088 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) { 2089 vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control & 2090 ~SECONDARY_EXEC_ENABLE_PML; 2091 exec_control |= vmcs12_exec_ctrl; 2092 } 2093 2094 /* VMCS shadowing for L2 is emulated for now */ 2095 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2096 2097 /* 2098 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2099 * will not have to rewrite the controls just for this bit. 2100 */ 2101 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && 2102 (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2103 exec_control |= SECONDARY_EXEC_DESC; 2104 2105 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2106 vmcs_write16(GUEST_INTR_STATUS, 2107 vmcs12->guest_intr_status); 2108 2109 secondary_exec_controls_set(vmx, exec_control); 2110 } 2111 2112 /* 2113 * ENTRY CONTROLS 2114 * 2115 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2116 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2117 * on the related bits (if supported by the CPU) in the hope that 2118 * we can avoid VMWrites during vmx_set_efer(). 2119 */ 2120 exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & 2121 ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; 2122 if (cpu_has_load_ia32_efer()) { 2123 if (guest_efer & EFER_LMA) 2124 exec_control |= VM_ENTRY_IA32E_MODE; 2125 if (guest_efer != host_efer) 2126 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2127 } 2128 vm_entry_controls_set(vmx, exec_control); 2129 2130 /* 2131 * EXIT CONTROLS 2132 * 2133 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2134 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2135 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
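	 *
	 * (For illustration, the EFER-driven speculation on the entry controls
	 * above reduces to a small pure function of the computed guest EFER.
	 * Standalone sketch; the SK_* bit values and names are placeholders
	 * for the real VMX control encodings:
	 *
	 *	#include <stdbool.h>
	 *	#include <stdint.h>
	 *
	 *	#define SK_EFER_LMA		(1ull << 10)
	 *	#define SK_ENTRY_IA32E_MODE	(1u << 9)
	 *	#define SK_ENTRY_LOAD_EFER	(1u << 15)
	 *
	 *	static uint32_t speculate_entry_ctls(uint32_t base,
	 *					     uint64_t guest_efer,
	 *					     uint64_t host_efer,
	 *					     bool has_load_efer)
	 *	{
	 *		uint32_t ctls = base & ~(SK_ENTRY_IA32E_MODE |
	 *					 SK_ENTRY_LOAD_EFER);
	 *
	 *		if (!has_load_efer)
	 *			return ctls;
	 *		if (guest_efer & SK_EFER_LMA)
	 *			ctls |= SK_ENTRY_IA32E_MODE;
	 *		if (guest_efer != host_efer)
	 *			ctls |= SK_ENTRY_LOAD_EFER;
	 *		return ctls;
	 *	})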
2136 */ 2137 exec_control = vmx_vmexit_ctrl(); 2138 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2139 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2140 vm_exit_controls_set(vmx, exec_control); 2141 2142 /* 2143 * Interrupt/Exception Fields 2144 */ 2145 if (vmx->nested.nested_run_pending) { 2146 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2147 vmcs12->vm_entry_intr_info_field); 2148 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2149 vmcs12->vm_entry_exception_error_code); 2150 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2151 vmcs12->vm_entry_instruction_len); 2152 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2153 vmcs12->guest_interruptibility_info); 2154 vmx->loaded_vmcs->nmi_known_unmasked = 2155 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2156 } else { 2157 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2158 } 2159 } 2160 2161 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2162 { 2163 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2164 2165 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2166 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2167 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2168 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2169 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2170 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2171 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2172 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2173 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2174 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2175 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2176 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2177 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2178 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2179 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2180 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2181 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2182 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2183 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2184 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2185 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2186 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2187 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2188 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2189 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2190 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2191 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2192 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2193 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2194 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2195 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2196 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2197 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2198 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2199 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2200 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2201 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2202 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2203 } 2204 2205 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2206 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2207 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2208 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 
2209 vmcs12->guest_pending_dbg_exceptions); 2210 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2211 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2212 2213 /* 2214 * L1 may access the L2's PDPTR, so save them to construct 2215 * vmcs12 2216 */ 2217 if (enable_ept) { 2218 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2219 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2220 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2221 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2222 } 2223 2224 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2225 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2226 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2227 } 2228 2229 if (nested_cpu_has_xsaves(vmcs12)) 2230 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2231 2232 /* 2233 * Whether page-faults are trapped is determined by a combination of 2234 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. 2235 * If enable_ept, L0 doesn't care about page faults and we should 2236 * set all of these to L1's desires. However, if !enable_ept, L0 does 2237 * care about (at least some) page faults, and because it is not easy 2238 * (if at all possible?) to merge L0 and L1's desires, we simply ask 2239 * to exit on each and every L2 page fault. This is done by setting 2240 * MASK=MATCH=0 and (see below) EB.PF=1. 2241 * Note that below we don't need special code to set EB.PF beyond the 2242 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2243 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2244 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2245 */ 2246 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 2247 enable_ept ? vmcs12->page_fault_error_code_mask : 0); 2248 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 2249 enable_ept ? vmcs12->page_fault_error_code_match : 0); 2250 2251 if (cpu_has_vmx_apicv()) { 2252 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2253 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2254 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2255 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2256 } 2257 2258 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2259 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2260 2261 set_cr4_guest_host_mask(vmx); 2262 } 2263 2264 /* 2265 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2266 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2267 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2268 * guest in a way that will both be appropriate to L1's requests, and our 2269 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2270 * function also has additional necessary side-effects, like setting various 2271 * vcpu->arch fields. 2272 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2273 * is assigned to entry_failure_code on failure. 
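 *
 * As an aside, the page-fault filtering policy applied in
 * prepare_vmcs02_rare() above can be summarized as a tiny helper: when EPT
 * is enabled, honour L1's PFEC mask/match; when EPT is disabled, force every
 * L2 #PF to exit by using mask = match = 0 (with EB.PF already forced to 1
 * in that case).  Standalone sketch, names invented for illustration:
 *
 *	struct pfec_policy {
 *		unsigned int mask;
 *		unsigned int match;
 *	};
 *
 *	static struct pfec_policy merge_pfec(int ept_enabled,
 *					     unsigned int l1_mask,
 *					     unsigned int l1_match)
 *	{
 *		struct pfec_policy p = { 0, 0 };
 *
 *		if (ept_enabled) {
 *			p.mask = l1_mask;
 *			p.match = l1_match;
 *		}
 *		return p;
 *	}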
2274 */ 2275 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2276 u32 *entry_failure_code) 2277 { 2278 struct vcpu_vmx *vmx = to_vmx(vcpu); 2279 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2280 bool load_guest_pdptrs_vmcs12 = false; 2281 2282 if (vmx->nested.dirty_vmcs12 || hv_evmcs) { 2283 prepare_vmcs02_rare(vmx, vmcs12); 2284 vmx->nested.dirty_vmcs12 = false; 2285 2286 load_guest_pdptrs_vmcs12 = !hv_evmcs || 2287 !(hv_evmcs->hv_clean_fields & 2288 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2289 } 2290 2291 if (vmx->nested.nested_run_pending && 2292 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2293 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2294 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2295 } else { 2296 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2297 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 2298 } 2299 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2300 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2301 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); 2302 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2303 2304 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2305 * bitwise-or of what L1 wants to trap for L2, and what we want to 2306 * trap. Note that CR0.TS also needs updating - we do this later. 2307 */ 2308 update_exception_bitmap(vcpu); 2309 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2310 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2311 2312 if (vmx->nested.nested_run_pending && 2313 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2314 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2315 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2316 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2317 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2318 } 2319 2320 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2321 2322 if (kvm_has_tsc_control) 2323 decache_tsc_multiplier(vmx); 2324 2325 if (enable_vpid) { 2326 /* 2327 * There is no direct mapping between vpid02 and vpid12, the 2328 * vpid02 is per-vCPU for L0 and reused while the value of 2329 * vpid12 is changed w/ one invvpid during nested vmentry. 2330 * The vpid12 is allocated by L1 for L2, so it will not 2331 * influence global bitmap(for vpid01 and vpid02 allocation) 2332 * even if spawn a lot of nested vCPUs. 2333 */ 2334 if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) { 2335 if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 2336 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 2337 __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false); 2338 } 2339 } else { 2340 /* 2341 * If L1 use EPT, then L0 needs to execute INVEPT on 2342 * EPTP02 instead of EPTP01. Therefore, delay TLB 2343 * flush until vmcs02->eptp is fully updated by 2344 * KVM_REQ_LOAD_CR3. Note that this assumes 2345 * KVM_REQ_TLB_FLUSH is evaluated after 2346 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). 2347 */ 2348 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2349 } 2350 } 2351 2352 if (nested_cpu_has_ept(vmcs12)) 2353 nested_ept_init_mmu_context(vcpu); 2354 else if (nested_cpu_has2(vmcs12, 2355 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2356 vmx_flush_tlb(vcpu, true); 2357 2358 /* 2359 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those 2360 * bits which we consider mandatory enabled. 
2361 * The CR0_READ_SHADOW is what L2 should have expected to read given 2362 * the specifications by L1; It's not enough to take 2363 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we 2364 * have more bits than L1 expected. 2365 */ 2366 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2367 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2368 2369 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2370 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2371 2372 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2373 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2374 vmx_set_efer(vcpu, vcpu->arch.efer); 2375 2376 /* 2377 * Guest state is invalid and unrestricted guest is disabled, 2378 * which means L1 attempted VMEntry to L2 with invalid state. 2379 * Fail the VMEntry. 2380 */ 2381 if (vmx->emulation_required) { 2382 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2383 return -EINVAL; 2384 } 2385 2386 /* Shadow page tables on either EPT or shadow page tables. */ 2387 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2388 entry_failure_code)) 2389 return -EINVAL; 2390 2391 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2392 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2393 is_pae_paging(vcpu)) { 2394 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2395 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2396 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2397 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2398 } 2399 2400 if (!enable_ept) 2401 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 2402 2403 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2404 kvm_rip_write(vcpu, vmcs12->guest_rip); 2405 return 0; 2406 } 2407 2408 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2409 { 2410 if (!nested_cpu_has_nmi_exiting(vmcs12) && 2411 nested_cpu_has_virtual_nmis(vmcs12)) 2412 return -EINVAL; 2413 2414 if (!nested_cpu_has_virtual_nmis(vmcs12) && 2415 nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)) 2416 return -EINVAL; 2417 2418 return 0; 2419 } 2420 2421 static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) 2422 { 2423 struct vcpu_vmx *vmx = to_vmx(vcpu); 2424 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2425 2426 /* Check for memory type validity */ 2427 switch (address & VMX_EPTP_MT_MASK) { 2428 case VMX_EPTP_MT_UC: 2429 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)) 2430 return false; 2431 break; 2432 case VMX_EPTP_MT_WB: 2433 if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)) 2434 return false; 2435 break; 2436 default: 2437 return false; 2438 } 2439 2440 /* only 4 levels page-walk length are valid */ 2441 if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4) 2442 return false; 2443 2444 /* Reserved bits should not be set */ 2445 if (address >> maxphyaddr || ((address >> 7) & 0x1f)) 2446 return false; 2447 2448 /* AD, if set, should be supported */ 2449 if (address & VMX_EPTP_AD_ENABLE_BIT) { 2450 if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)) 2451 return false; 2452 } 2453 2454 return true; 2455 } 2456 2457 /* 2458 * Checks related to VM-Execution Control Fields 2459 */ 2460 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2461 struct vmcs12 *vmcs12) 2462 { 2463 struct vcpu_vmx *vmx = to_vmx(vcpu); 2464 2465 if (!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2466 vmx->nested.msrs.pinbased_ctls_low, 2467 vmx->nested.msrs.pinbased_ctls_high) || 2468 !vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2469 
vmx->nested.msrs.procbased_ctls_low, 2470 vmx->nested.msrs.procbased_ctls_high)) 2471 return -EINVAL; 2472 2473 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2474 !vmx_control_verify(vmcs12->secondary_vm_exec_control, 2475 vmx->nested.msrs.secondary_ctls_low, 2476 vmx->nested.msrs.secondary_ctls_high)) 2477 return -EINVAL; 2478 2479 if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu) || 2480 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2481 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2482 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2483 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2484 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2485 nested_vmx_check_nmi_controls(vmcs12) || 2486 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2487 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2488 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2489 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2490 (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2491 return -EINVAL; 2492 2493 if (!nested_cpu_has_preemption_timer(vmcs12) && 2494 nested_cpu_has_save_preemption_timer(vmcs12)) 2495 return -EINVAL; 2496 2497 if (nested_cpu_has_ept(vmcs12) && 2498 !valid_ept_address(vcpu, vmcs12->ept_pointer)) 2499 return -EINVAL; 2500 2501 if (nested_cpu_has_vmfunc(vmcs12)) { 2502 if (vmcs12->vm_function_control & 2503 ~vmx->nested.msrs.vmfunc_controls) 2504 return -EINVAL; 2505 2506 if (nested_cpu_has_eptp_switching(vmcs12)) { 2507 if (!nested_cpu_has_ept(vmcs12) || 2508 !page_address_valid(vcpu, vmcs12->eptp_list_address)) 2509 return -EINVAL; 2510 } 2511 } 2512 2513 return 0; 2514 } 2515 2516 /* 2517 * Checks related to VM-Exit Control Fields 2518 */ 2519 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2520 struct vmcs12 *vmcs12) 2521 { 2522 struct vcpu_vmx *vmx = to_vmx(vcpu); 2523 2524 if (!vmx_control_verify(vmcs12->vm_exit_controls, 2525 vmx->nested.msrs.exit_ctls_low, 2526 vmx->nested.msrs.exit_ctls_high) || 2527 nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)) 2528 return -EINVAL; 2529 2530 return 0; 2531 } 2532 2533 /* 2534 * Checks related to VM-Entry Control Fields 2535 */ 2536 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2537 struct vmcs12 *vmcs12) 2538 { 2539 struct vcpu_vmx *vmx = to_vmx(vcpu); 2540 2541 if (!vmx_control_verify(vmcs12->vm_entry_controls, 2542 vmx->nested.msrs.entry_ctls_low, 2543 vmx->nested.msrs.entry_ctls_high)) 2544 return -EINVAL; 2545 2546 /* 2547 * From the Intel SDM, volume 3: 2548 * Fields relevant to VM-entry event injection must be set properly. 2549 * These fields are the VM-entry interruption-information field, the 2550 * VM-entry exception error code, and the VM-entry instruction length. 
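	 *
	 * For reference, the interruption-information field examined by the
	 * checks below packs its pieces roughly as bits 7:0 = vector,
	 * 10:8 = type, 11 = deliver error code, 31 = valid.  A standalone
	 * sketch of that decoding (names invented, not the KVM macros):
	 *
	 *	#include <stdint.h>
	 *
	 *	struct intr_info_fields {
	 *		uint8_t vector;
	 *		uint8_t type;
	 *		int has_error_code;
	 *		int valid;
	 *	};
	 *
	 *	static struct intr_info_fields decode_intr_info(uint32_t info)
	 *	{
	 *		struct intr_info_fields f = {
	 *			.vector		= info & 0xff,
	 *			.type		= (info >> 8) & 0x7,
	 *			.has_error_code	= !!(info & (1u << 11)),
	 *			.valid		= !!(info & (1u << 31)),
	 *		};
	 *		return f;
	 *	}
	 *
	 * e.g. an NMI injection (vector 2, type 2, no error code) encodes as
	 * 0x80000202.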
2551 */ 2552 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2553 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2554 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2555 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2556 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2557 bool should_have_error_code; 2558 bool urg = nested_cpu_has2(vmcs12, 2559 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2560 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2561 2562 /* VM-entry interruption-info field: interruption type */ 2563 if (intr_type == INTR_TYPE_RESERVED || 2564 (intr_type == INTR_TYPE_OTHER_EVENT && 2565 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2566 return -EINVAL; 2567 2568 /* VM-entry interruption-info field: vector */ 2569 if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2570 (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2571 (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2572 return -EINVAL; 2573 2574 /* VM-entry interruption-info field: deliver error code */ 2575 should_have_error_code = 2576 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2577 x86_exception_has_error_code(vector); 2578 if (has_error_code != should_have_error_code) 2579 return -EINVAL; 2580 2581 /* VM-entry exception error code */ 2582 if (has_error_code && 2583 vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)) 2584 return -EINVAL; 2585 2586 /* VM-entry interruption-info field: reserved bits */ 2587 if (intr_info & INTR_INFO_RESVD_BITS_MASK) 2588 return -EINVAL; 2589 2590 /* VM-entry instruction length */ 2591 switch (intr_type) { 2592 case INTR_TYPE_SOFT_EXCEPTION: 2593 case INTR_TYPE_SOFT_INTR: 2594 case INTR_TYPE_PRIV_SW_EXCEPTION: 2595 if ((vmcs12->vm_entry_instruction_len > 15) || 2596 (vmcs12->vm_entry_instruction_len == 0 && 2597 !nested_cpu_has_zero_length_injection(vcpu))) 2598 return -EINVAL; 2599 } 2600 } 2601 2602 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2603 return -EINVAL; 2604 2605 return 0; 2606 } 2607 2608 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2609 struct vmcs12 *vmcs12) 2610 { 2611 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2612 nested_check_vm_exit_controls(vcpu, vmcs12) || 2613 nested_check_vm_entry_controls(vcpu, vmcs12)) 2614 return -EINVAL; 2615 2616 return 0; 2617 } 2618 2619 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2620 struct vmcs12 *vmcs12) 2621 { 2622 bool ia32e; 2623 2624 if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || 2625 !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || 2626 !nested_cr3_valid(vcpu, vmcs12->host_cr3)) 2627 return -EINVAL; 2628 2629 if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) || 2630 is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)) 2631 return -EINVAL; 2632 2633 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2634 !kvm_pat_valid(vmcs12->host_ia32_pat)) 2635 return -EINVAL; 2636 2637 ia32e = (vmcs12->vm_exit_controls & 2638 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; 2639 2640 if (vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || 2641 vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || 2642 vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || 2643 vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || 2644 vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || 2645 vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) || 2646 vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK) 
|| 2647 vmcs12->host_cs_selector == 0 || 2648 vmcs12->host_tr_selector == 0 || 2649 (vmcs12->host_ss_selector == 0 && !ia32e)) 2650 return -EINVAL; 2651 2652 #ifdef CONFIG_X86_64 2653 if (is_noncanonical_address(vmcs12->host_fs_base, vcpu) || 2654 is_noncanonical_address(vmcs12->host_gs_base, vcpu) || 2655 is_noncanonical_address(vmcs12->host_gdtr_base, vcpu) || 2656 is_noncanonical_address(vmcs12->host_idtr_base, vcpu) || 2657 is_noncanonical_address(vmcs12->host_tr_base, vcpu)) 2658 return -EINVAL; 2659 #endif 2660 2661 /* 2662 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2663 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2664 * the values of the LMA and LME bits in the field must each be that of 2665 * the host address-space size VM-exit control. 2666 */ 2667 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2668 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || 2669 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || 2670 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) 2671 return -EINVAL; 2672 } 2673 2674 return 0; 2675 } 2676 2677 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2678 struct vmcs12 *vmcs12) 2679 { 2680 int r = 0; 2681 struct vmcs12 *shadow; 2682 struct kvm_host_map map; 2683 2684 if (vmcs12->vmcs_link_pointer == -1ull) 2685 return 0; 2686 2687 if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) 2688 return -EINVAL; 2689 2690 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)) 2691 return -EINVAL; 2692 2693 shadow = map.hva; 2694 2695 if (shadow->hdr.revision_id != VMCS12_REVISION || 2696 shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) 2697 r = -EINVAL; 2698 2699 kvm_vcpu_unmap(vcpu, &map, false); 2700 return r; 2701 } 2702 2703 /* 2704 * Checks related to Guest Non-register State 2705 */ 2706 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 2707 { 2708 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 2709 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) 2710 return -EINVAL; 2711 2712 return 0; 2713 } 2714 2715 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 2716 struct vmcs12 *vmcs12, 2717 u32 *exit_qual) 2718 { 2719 bool ia32e; 2720 2721 *exit_qual = ENTRY_FAIL_DEFAULT; 2722 2723 if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || 2724 !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) 2725 return -EINVAL; 2726 2727 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 2728 !kvm_pat_valid(vmcs12->guest_ia32_pat)) 2729 return -EINVAL; 2730 2731 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 2732 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; 2733 return -EINVAL; 2734 } 2735 2736 /* 2737 * If the load IA32_EFER VM-entry control is 1, the following checks 2738 * are performed on the field for the IA32_EFER MSR: 2739 * - Bits reserved in the IA32_EFER MSR must be 0. 2740 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 2741 * the IA-32e mode guest VM-exit control. It must also be identical 2742 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 2743 * CR0.PG) is 1. 
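	 *
	 * Put differently, the EFER consistency rule enforced just below is a
	 * pure predicate of the IA-32e control bit, CR0 and the would-be EFER
	 * value.  Standalone sketch (bit positions per the SDM; the helper
	 * name and SK_* macros are invented for illustration):
	 *
	 *	#include <stdbool.h>
	 *	#include <stdint.h>
	 *
	 *	#define SK_EFER_LME	(1ull << 8)
	 *	#define SK_EFER_LMA	(1ull << 10)
	 *	#define SK_CR0_PG	(1ull << 31)
	 *
	 *	static bool efer_matches_ia32e(uint64_t efer, uint64_t cr0,
	 *				       bool ia32e)
	 *	{
	 *		if (ia32e != !!(efer & SK_EFER_LMA))
	 *			return false;
	 *		if ((cr0 & SK_CR0_PG) &&
	 *		    ia32e != !!(efer & SK_EFER_LME))
	 *			return false;
	 *		return true;
	 *	}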
2744 */ 2745 if (to_vmx(vcpu)->nested.nested_run_pending && 2746 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 2747 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 2748 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || 2749 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || 2750 ((vmcs12->guest_cr0 & X86_CR0_PG) && 2751 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) 2752 return -EINVAL; 2753 } 2754 2755 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 2756 (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || 2757 (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) 2758 return -EINVAL; 2759 2760 if (nested_check_guest_non_reg_state(vmcs12)) 2761 return -EINVAL; 2762 2763 return 0; 2764 } 2765 2766 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 2767 { 2768 struct vcpu_vmx *vmx = to_vmx(vcpu); 2769 unsigned long cr3, cr4; 2770 bool vm_fail; 2771 2772 if (!nested_early_check) 2773 return 0; 2774 2775 if (vmx->msr_autoload.host.nr) 2776 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 2777 if (vmx->msr_autoload.guest.nr) 2778 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 2779 2780 preempt_disable(); 2781 2782 vmx_prepare_switch_to_guest(vcpu); 2783 2784 /* 2785 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 2786 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 2787 * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e. 2788 * there is no need to preserve other bits or save/restore the field. 2789 */ 2790 vmcs_writel(GUEST_RFLAGS, 0); 2791 2792 cr3 = __get_current_cr3_fast(); 2793 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 2794 vmcs_writel(HOST_CR3, cr3); 2795 vmx->loaded_vmcs->host_state.cr3 = cr3; 2796 } 2797 2798 cr4 = cr4_read_shadow(); 2799 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 2800 vmcs_writel(HOST_CR4, cr4); 2801 vmx->loaded_vmcs->host_state.cr4 = cr4; 2802 } 2803 2804 asm( 2805 "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */ 2806 "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" 2807 "je 1f \n\t" 2808 __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t" 2809 "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t" 2810 "1: \n\t" 2811 "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */ 2812 2813 /* Check if vmlaunch or vmresume is needed */ 2814 "cmpb $0, %c[launched](%[loaded_vmcs])\n\t" 2815 2816 /* 2817 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set 2818 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail 2819 * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the 2820 * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail. 
2821 */ 2822 "call vmx_vmenter\n\t" 2823 2824 CC_SET(be) 2825 : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail) 2826 : [HOST_RSP]"r"((unsigned long)HOST_RSP), 2827 [loaded_vmcs]"r"(vmx->loaded_vmcs), 2828 [launched]"i"(offsetof(struct loaded_vmcs, launched)), 2829 [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)), 2830 [wordsize]"i"(sizeof(ulong)) 2831 : "memory" 2832 ); 2833 2834 if (vmx->msr_autoload.host.nr) 2835 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2836 if (vmx->msr_autoload.guest.nr) 2837 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2838 2839 if (vm_fail) { 2840 preempt_enable(); 2841 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 2842 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 2843 return 1; 2844 } 2845 2846 /* 2847 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 2848 */ 2849 local_irq_enable(); 2850 if (hw_breakpoint_active()) 2851 set_debugreg(__this_cpu_read(cpu_dr7), 7); 2852 preempt_enable(); 2853 2854 /* 2855 * A non-failing VMEntry means we somehow entered guest mode with 2856 * an illegal RIP, and that's just the tip of the iceberg. There 2857 * is no telling what memory has been modified or what state has 2858 * been exposed to unknown code. Hitting this all but guarantees 2859 * a (very critical) hardware issue. 2860 */ 2861 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 2862 VMX_EXIT_REASONS_FAILED_VMENTRY)); 2863 2864 return 0; 2865 } 2866 2867 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 2868 struct vmcs12 *vmcs12); 2869 2870 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 2871 { 2872 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2873 struct vcpu_vmx *vmx = to_vmx(vcpu); 2874 struct kvm_host_map *map; 2875 struct page *page; 2876 u64 hpa; 2877 2878 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 2879 /* 2880 * Translate L1 physical address to host physical 2881 * address for vmcs02. Keep the page pinned, so this 2882 * physical address remains valid. We keep a reference 2883 * to it so we can release it later. 2884 */ 2885 if (vmx->nested.apic_access_page) { /* shouldn't happen */ 2886 kvm_release_page_dirty(vmx->nested.apic_access_page); 2887 vmx->nested.apic_access_page = NULL; 2888 } 2889 page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); 2890 /* 2891 * If translation failed, no matter: This feature asks 2892 * to exit when accessing the given address, and if it 2893 * can never be accessed, this feature won't do 2894 * anything anyway. 2895 */ 2896 if (!is_error_page(page)) { 2897 vmx->nested.apic_access_page = page; 2898 hpa = page_to_phys(vmx->nested.apic_access_page); 2899 vmcs_write64(APIC_ACCESS_ADDR, hpa); 2900 } else { 2901 secondary_exec_controls_clearbit(vmx, 2902 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); 2903 } 2904 } 2905 2906 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 2907 map = &vmx->nested.virtual_apic_map; 2908 2909 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 2910 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 2911 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 2912 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 2913 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 2914 /* 2915 * The processor will never use the TPR shadow, simply 2916 * clear the bit from the execution control. Such a 2917 * configuration is useless, but it happens in tests. 
2918 * For any other configuration, failing the vm entry is 2919 * _not_ what the processor does but it's basically the 2920 * only possibility we have. 2921 */ 2922 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 2923 } else { 2924 /* 2925 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 2926 * force VM-Entry to fail. 2927 */ 2928 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); 2929 } 2930 } 2931 2932 if (nested_cpu_has_posted_intr(vmcs12)) { 2933 map = &vmx->nested.pi_desc_map; 2934 2935 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 2936 vmx->nested.pi_desc = 2937 (struct pi_desc *)(((void *)map->hva) + 2938 offset_in_page(vmcs12->posted_intr_desc_addr)); 2939 vmcs_write64(POSTED_INTR_DESC_ADDR, 2940 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 2941 } 2942 } 2943 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 2944 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 2945 else 2946 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 2947 } 2948 2949 /* 2950 * Intel's VMX Instruction Reference specifies a common set of prerequisites 2951 * for running VMX instructions (except VMXON, whose prerequisites are 2952 * slightly different). It also specifies what exception to inject otherwise. 2953 * Note that many of these exceptions have priority over VM exits, so they 2954 * don't have to be checked again here. 2955 */ 2956 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 2957 { 2958 if (!to_vmx(vcpu)->nested.vmxon) { 2959 kvm_queue_exception(vcpu, UD_VECTOR); 2960 return 0; 2961 } 2962 2963 if (vmx_get_cpl(vcpu)) { 2964 kvm_inject_gp(vcpu, 0); 2965 return 0; 2966 } 2967 2968 return 1; 2969 } 2970 2971 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 2972 { 2973 u8 rvi = vmx_get_rvi(); 2974 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 2975 2976 return ((rvi & 0xf0) > (vppr & 0xf0)); 2977 } 2978 2979 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 2980 struct vmcs12 *vmcs12); 2981 2982 /* 2983 * If from_vmentry is false, this is being called from state restore (either RSM 2984 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 2985 + * 2986 + * Returns: 2987 + * 0 - success, i.e. proceed with actual VMEnter 2988 + * 1 - consistency check VMExit 2989 + * -1 - consistency check VMFail 2990 */ 2991 int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) 2992 { 2993 struct vcpu_vmx *vmx = to_vmx(vcpu); 2994 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2995 bool evaluate_pending_interrupts; 2996 u32 exit_reason = EXIT_REASON_INVALID_STATE; 2997 u32 exit_qual; 2998 2999 evaluate_pending_interrupts = exec_controls_get(vmx) & 3000 (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); 3001 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3002 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3003 3004 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3005 vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3006 if (kvm_mpx_supported() && 3007 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 3008 vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3009 3010 /* 3011 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3012 * nested early checks are disabled. In the event of a "late" VM-Fail, 3013 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3014 * software model to the pre-VMEntry host state. 
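	 *
	 * (Aside on vmx_has_apicv_interrupt() above: APIC interrupt priority
	 * is compared by 16-vector priority class, i.e. only the upper nibble
	 * of RVI/PPR matters.  Standalone sketch of that comparison, helper
	 * name invented:
	 *
	 *	#include <stdbool.h>
	 *	#include <stdint.h>
	 *
	 *	static bool pending_intr_beats_ppr(uint8_t rvi, uint8_t ppr)
	 *	{
	 *		return (rvi & 0xf0) > (ppr & 0xf0);
	 *	}
	 *
	 * so e.g. rvi = 0x41 does not beat ppr = 0x4f, but rvi = 0x51 does.)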
When EPT is disabled, 3015 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3016 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3017 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3018 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3019 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3020 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3021 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3022 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3023 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3024 * path would need to manually save/restore vmcs01.GUEST_CR3. 3025 */ 3026 if (!enable_ept && !nested_early_check) 3027 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3028 3029 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3030 3031 prepare_vmcs02_early(vmx, vmcs12); 3032 3033 if (from_vmentry) { 3034 nested_get_vmcs12_pages(vcpu); 3035 3036 if (nested_vmx_check_vmentry_hw(vcpu)) { 3037 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3038 return -1; 3039 } 3040 3041 if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) 3042 goto vmentry_fail_vmexit; 3043 } 3044 3045 enter_guest_mode(vcpu); 3046 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 3047 vcpu->arch.tsc_offset += vmcs12->tsc_offset; 3048 3049 if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) 3050 goto vmentry_fail_vmexit_guest_mode; 3051 3052 if (from_vmentry) { 3053 exit_reason = EXIT_REASON_MSR_LOAD_FAIL; 3054 exit_qual = nested_vmx_load_msr(vcpu, 3055 vmcs12->vm_entry_msr_load_addr, 3056 vmcs12->vm_entry_msr_load_count); 3057 if (exit_qual) 3058 goto vmentry_fail_vmexit_guest_mode; 3059 } else { 3060 /* 3061 * The MMU is not initialized to point at the right entities yet and 3062 * "get pages" would need to read data from the guest (i.e. we will 3063 * need to perform gpa to hpa translation). Request a call 3064 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3065 * have already been set at vmentry time and should not be reset. 3066 */ 3067 kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); 3068 } 3069 3070 /* 3071 * If L1 had a pending IRQ/NMI until it executed 3072 * VMLAUNCH/VMRESUME which wasn't delivered because it was 3073 * disallowed (e.g. interrupts disabled), L0 needs to 3074 * evaluate if this pending event should cause an exit from L2 3075 * to L1 or delivered directly to L2 (e.g. In case L1 don't 3076 * intercept EXTERNAL_INTERRUPT). 3077 * 3078 * Usually this would be handled by the processor noticing an 3079 * IRQ/NMI window request, or checking RVI during evaluation of 3080 * pending virtual interrupts. However, this setting was done 3081 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 3082 * to perform pending event evaluation by requesting a KVM_REQ_EVENT. 3083 */ 3084 if (unlikely(evaluate_pending_interrupts)) 3085 kvm_make_request(KVM_REQ_EVENT, vcpu); 3086 3087 /* 3088 * Do not start the preemption timer hrtimer until after we know 3089 * we are successful, so that only nested_vmx_vmexit needs to cancel 3090 * the timer. 3091 */ 3092 vmx->nested.preemption_timer_expired = false; 3093 if (nested_cpu_has_preemption_timer(vmcs12)) 3094 vmx_start_preemption_timer(vcpu); 3095 3096 /* 3097 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3098 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3099 * returned as far as L1 is concerned. 
It will only return (and set 3100 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3101 */ 3102 return 0; 3103 3104 /* 3105 * A failed consistency check that leads to a VMExit during L1's 3106 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3107 * 26.7 "VM-entry failures during or after loading guest state". 3108 */ 3109 vmentry_fail_vmexit_guest_mode: 3110 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 3111 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3112 leave_guest_mode(vcpu); 3113 3114 vmentry_fail_vmexit: 3115 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3116 3117 if (!from_vmentry) 3118 return 1; 3119 3120 load_vmcs12_host_state(vcpu, vmcs12); 3121 vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 3122 vmcs12->exit_qualification = exit_qual; 3123 if (enable_shadow_vmcs || vmx->nested.hv_evmcs) 3124 vmx->nested.need_vmcs12_to_shadow_sync = true; 3125 return 1; 3126 } 3127 3128 /* 3129 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3130 * for running an L2 nested guest. 3131 */ 3132 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3133 { 3134 struct vmcs12 *vmcs12; 3135 struct vcpu_vmx *vmx = to_vmx(vcpu); 3136 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3137 int ret; 3138 3139 if (!nested_vmx_check_permission(vcpu)) 3140 return 1; 3141 3142 if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch)) 3143 return 1; 3144 3145 if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) 3146 return nested_vmx_failInvalid(vcpu); 3147 3148 vmcs12 = get_vmcs12(vcpu); 3149 3150 /* 3151 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3152 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3153 * rather than RFLAGS.ZF, and no error number is stored to the 3154 * VM-instruction error field. 3155 */ 3156 if (vmcs12->hdr.shadow_vmcs) 3157 return nested_vmx_failInvalid(vcpu); 3158 3159 if (vmx->nested.hv_evmcs) { 3160 copy_enlightened_to_vmcs12(vmx); 3161 /* Enlightened VMCS doesn't have launch state */ 3162 vmcs12->launch_state = !launch; 3163 } else if (enable_shadow_vmcs) { 3164 copy_shadow_to_vmcs12(vmx); 3165 } 3166 3167 /* 3168 * The nested entry process starts with enforcing various prerequisites 3169 * on vmcs12 as required by the Intel SDM, and act appropriately when 3170 * they fail: As the SDM explains, some conditions should cause the 3171 * instruction to fail, while others will cause the instruction to seem 3172 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3173 * To speed up the normal (success) code path, we should avoid checking 3174 * for misconfigurations which will anyway be caught by the processor 3175 * when using the merged vmcs02. 3176 */ 3177 if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) 3178 return nested_vmx_failValid(vcpu, 3179 VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3180 3181 if (vmcs12->launch_state == launch) 3182 return nested_vmx_failValid(vcpu, 3183 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS 3184 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3185 3186 if (nested_vmx_check_controls(vcpu, vmcs12)) 3187 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3188 3189 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3190 return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3191 3192 /* 3193 * We're finally done with prerequisite checking, and can start with 3194 * the nested entry. 
	 */
	vmx->nested.nested_run_pending = 1;
	ret = nested_vmx_enter_non_root_mode(vcpu, true);
	vmx->nested.nested_run_pending = !ret;
	if (ret > 0)
		return 1;
	else if (ret)
		return nested_vmx_failValid(vcpu,
			VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	/* Hide L1D cache contents from the nested guest. */
	vmx->vcpu.arch.l1tf_flush_l1d = true;

	/*
	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
	 * also be used as part of restoring nVMX state for
	 * snapshot restore (migration).
	 *
	 * In this flow, it is assumed that the vmcs12 cache was
	 * transferred as part of the captured nVMX state and should
	 * therefore not be read from guest memory (which may not
	 * exist on the destination host yet).
	 */
	nested_cache_shadow_vmcs12(vcpu, vmcs12);

	/*
	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
	 * awakened by event injection or by an NMI-window VM-exit or
	 * by an interrupt-window VM-exit, halt the vcpu.
	 */
	if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
	    !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
	    !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
	    !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
	      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
		vmx->nested.nested_run_pending = 0;
		return kvm_vcpu_halt(vcpu);
	}
	return 1;
}

/*
 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
 * This function returns the new value we should put in vmcs12.guest_cr0.
 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
 *    available in vmcs02 GUEST_CR0. (Note: it's enough to check that L0
 *    didn't trap the bit, because if L1 did, so would L0).
 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have
 *    been modified by L2, and L1 knows it. So just leave the old value of
 *    the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
 *    isn't relevant, because if L0 traps this bit it can set it to anything.
 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
 *    changed these bits, and therefore they need to be updated, but L0
 *    didn't necessarily allow them to be changed in GUEST_CR0 - and rather
 *    put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
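 *
 * In other words, the merge implemented by vmcs12_guest_cr0() below picks
 * each CR0 bit from one of three sources depending on who owns it.  A
 * standalone sketch of the same bit selection (parameter names invented,
 * userspace style):
 *
 *	#include <stdint.h>
 *
 *	static uint64_t merge_guest_cr0(uint64_t vmcs02_cr0,
 *					uint64_t vmcs02_cr0_read_shadow,
 *					uint64_t vmcs12_cr0,
 *					uint64_t l1_mask,   // cr0_guest_host_mask
 *					uint64_t l0_owned)  // bits L0 lets the guest own
 *	{
 *		return (vmcs02_cr0 & l0_owned) |		// case 1
 *		       (vmcs12_cr0 & l1_mask) |			// case 2
 *		       (vmcs02_cr0_read_shadow &
 *			~(l1_mask | l0_owned));			// case 3
 *	}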
3252 */ 3253 static inline unsigned long 3254 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3255 { 3256 return 3257 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3258 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3259 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3260 vcpu->arch.cr0_guest_owned_bits)); 3261 } 3262 3263 static inline unsigned long 3264 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3265 { 3266 return 3267 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3268 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3269 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3270 vcpu->arch.cr4_guest_owned_bits)); 3271 } 3272 3273 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3274 struct vmcs12 *vmcs12) 3275 { 3276 u32 idt_vectoring; 3277 unsigned int nr; 3278 3279 if (vcpu->arch.exception.injected) { 3280 nr = vcpu->arch.exception.nr; 3281 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3282 3283 if (kvm_exception_is_soft(nr)) { 3284 vmcs12->vm_exit_instruction_len = 3285 vcpu->arch.event_exit_inst_len; 3286 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3287 } else 3288 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3289 3290 if (vcpu->arch.exception.has_error_code) { 3291 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3292 vmcs12->idt_vectoring_error_code = 3293 vcpu->arch.exception.error_code; 3294 } 3295 3296 vmcs12->idt_vectoring_info_field = idt_vectoring; 3297 } else if (vcpu->arch.nmi_injected) { 3298 vmcs12->idt_vectoring_info_field = 3299 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3300 } else if (vcpu->arch.interrupt.injected) { 3301 nr = vcpu->arch.interrupt.nr; 3302 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3303 3304 if (vcpu->arch.interrupt.soft) { 3305 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3306 vmcs12->vm_entry_instruction_len = 3307 vcpu->arch.event_exit_inst_len; 3308 } else 3309 idt_vectoring |= INTR_TYPE_EXT_INTR; 3310 3311 vmcs12->idt_vectoring_info_field = idt_vectoring; 3312 } 3313 } 3314 3315 3316 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3317 { 3318 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3319 gfn_t gfn; 3320 3321 /* 3322 * Don't need to mark the APIC access page dirty; it is never 3323 * written to by the CPU during APIC virtualization. 
3324 */ 3325 3326 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3327 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3328 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3329 } 3330 3331 if (nested_cpu_has_posted_intr(vmcs12)) { 3332 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3333 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3334 } 3335 } 3336 3337 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3338 { 3339 struct vcpu_vmx *vmx = to_vmx(vcpu); 3340 int max_irr; 3341 void *vapic_page; 3342 u16 status; 3343 3344 if (!vmx->nested.pi_desc || !vmx->nested.pi_pending) 3345 return; 3346 3347 vmx->nested.pi_pending = false; 3348 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3349 return; 3350 3351 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3352 if (max_irr != 256) { 3353 vapic_page = vmx->nested.virtual_apic_map.hva; 3354 if (!vapic_page) 3355 return; 3356 3357 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3358 vapic_page, &max_irr); 3359 status = vmcs_read16(GUEST_INTR_STATUS); 3360 if ((u8)max_irr > ((u8)status & 0xff)) { 3361 status &= ~0xff; 3362 status |= (u8)max_irr; 3363 vmcs_write16(GUEST_INTR_STATUS, status); 3364 } 3365 } 3366 3367 nested_mark_vmcs12_pages_dirty(vcpu); 3368 } 3369 3370 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu, 3371 unsigned long exit_qual) 3372 { 3373 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3374 unsigned int nr = vcpu->arch.exception.nr; 3375 u32 intr_info = nr | INTR_INFO_VALID_MASK; 3376 3377 if (vcpu->arch.exception.has_error_code) { 3378 vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code; 3379 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3380 } 3381 3382 if (kvm_exception_is_soft(nr)) 3383 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3384 else 3385 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3386 3387 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3388 vmx_get_nmi_mask(vcpu)) 3389 intr_info |= INTR_INFO_UNBLOCK_NMI; 3390 3391 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3392 } 3393 3394 static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) 3395 { 3396 struct vcpu_vmx *vmx = to_vmx(vcpu); 3397 unsigned long exit_qual; 3398 bool block_nested_events = 3399 vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu); 3400 3401 if (vcpu->arch.exception.pending && 3402 nested_vmx_check_exception(vcpu, &exit_qual)) { 3403 if (block_nested_events) 3404 return -EBUSY; 3405 nested_vmx_inject_exception_vmexit(vcpu, exit_qual); 3406 return 0; 3407 } 3408 3409 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3410 vmx->nested.preemption_timer_expired) { 3411 if (block_nested_events) 3412 return -EBUSY; 3413 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 3414 return 0; 3415 } 3416 3417 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { 3418 if (block_nested_events) 3419 return -EBUSY; 3420 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 3421 NMI_VECTOR | INTR_TYPE_NMI_INTR | 3422 INTR_INFO_VALID_MASK, 0); 3423 /* 3424 * The NMI-triggered VM exit counts as injection: 3425 * clear this one and block further NMIs. 
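		 *
		 * (Aside on vmx_complete_nested_posted_interrupt() above:
		 * GUEST_INTR_STATUS keeps RVI in its low byte, and RVI is
		 * only raised, never lowered, when a higher pending vector is
		 * found in the PIR.  Standalone sketch, helper name invented:
		 *
		 *	#include <stdint.h>
		 *
		 *	static uint16_t raise_rvi(uint16_t intr_status,
		 *				  uint8_t max_irr)
		 *	{
		 *		if (max_irr > (uint8_t)(intr_status & 0xff)) {
		 *			intr_status &= ~0xff;
		 *			intr_status |= max_irr;
		 *		}
		 *		return intr_status;
		 *	}
		 *
		 * e.g. intr_status = 0x2030 with max_irr = 0x51 becomes
		 * 0x2051.)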
3426 */ 3427 vcpu->arch.nmi_pending = 0; 3428 vmx_set_nmi_mask(vcpu, true); 3429 return 0; 3430 } 3431 3432 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && 3433 nested_exit_on_intr(vcpu)) { 3434 if (block_nested_events) 3435 return -EBUSY; 3436 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 3437 return 0; 3438 } 3439 3440 vmx_complete_nested_posted_interrupt(vcpu); 3441 return 0; 3442 } 3443 3444 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 3445 { 3446 ktime_t remaining = 3447 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 3448 u64 value; 3449 3450 if (ktime_to_ns(remaining) <= 0) 3451 return 0; 3452 3453 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 3454 do_div(value, 1000000); 3455 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 3456 } 3457 3458 static bool is_vmcs12_ext_field(unsigned long field) 3459 { 3460 switch (field) { 3461 case GUEST_ES_SELECTOR: 3462 case GUEST_CS_SELECTOR: 3463 case GUEST_SS_SELECTOR: 3464 case GUEST_DS_SELECTOR: 3465 case GUEST_FS_SELECTOR: 3466 case GUEST_GS_SELECTOR: 3467 case GUEST_LDTR_SELECTOR: 3468 case GUEST_TR_SELECTOR: 3469 case GUEST_ES_LIMIT: 3470 case GUEST_CS_LIMIT: 3471 case GUEST_SS_LIMIT: 3472 case GUEST_DS_LIMIT: 3473 case GUEST_FS_LIMIT: 3474 case GUEST_GS_LIMIT: 3475 case GUEST_LDTR_LIMIT: 3476 case GUEST_TR_LIMIT: 3477 case GUEST_GDTR_LIMIT: 3478 case GUEST_IDTR_LIMIT: 3479 case GUEST_ES_AR_BYTES: 3480 case GUEST_DS_AR_BYTES: 3481 case GUEST_FS_AR_BYTES: 3482 case GUEST_GS_AR_BYTES: 3483 case GUEST_LDTR_AR_BYTES: 3484 case GUEST_TR_AR_BYTES: 3485 case GUEST_ES_BASE: 3486 case GUEST_CS_BASE: 3487 case GUEST_SS_BASE: 3488 case GUEST_DS_BASE: 3489 case GUEST_FS_BASE: 3490 case GUEST_GS_BASE: 3491 case GUEST_LDTR_BASE: 3492 case GUEST_TR_BASE: 3493 case GUEST_GDTR_BASE: 3494 case GUEST_IDTR_BASE: 3495 case GUEST_PENDING_DBG_EXCEPTIONS: 3496 case GUEST_BNDCFGS: 3497 return true; 3498 default: 3499 break; 3500 } 3501 3502 return false; 3503 } 3504 3505 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3506 struct vmcs12 *vmcs12) 3507 { 3508 struct vcpu_vmx *vmx = to_vmx(vcpu); 3509 3510 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 3511 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 3512 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 3513 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 3514 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 3515 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 3516 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 3517 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 3518 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 3519 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 3520 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 3521 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 3522 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 3523 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 3524 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 3525 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 3526 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 3527 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 3528 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 3529 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 3530 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 3531 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 3532 
vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 3533 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 3534 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 3535 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 3536 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 3537 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 3538 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 3539 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 3540 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 3541 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 3542 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 3543 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 3544 vmcs12->guest_pending_dbg_exceptions = 3545 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 3546 if (kvm_mpx_supported()) 3547 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3548 3549 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 3550 } 3551 3552 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 3553 struct vmcs12 *vmcs12) 3554 { 3555 struct vcpu_vmx *vmx = to_vmx(vcpu); 3556 int cpu; 3557 3558 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 3559 return; 3560 3561 3562 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 3563 3564 cpu = get_cpu(); 3565 vmx->loaded_vmcs = &vmx->nested.vmcs02; 3566 vmx_vcpu_load(&vmx->vcpu, cpu); 3567 3568 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 3569 3570 vmx->loaded_vmcs = &vmx->vmcs01; 3571 vmx_vcpu_load(&vmx->vcpu, cpu); 3572 put_cpu(); 3573 } 3574 3575 /* 3576 * Update the guest state fields of vmcs12 to reflect changes that 3577 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 3578 * VM-entry controls is also updated, since this is really a guest 3579 * state bit.) 3580 */ 3581 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3582 { 3583 struct vcpu_vmx *vmx = to_vmx(vcpu); 3584 3585 if (vmx->nested.hv_evmcs) 3586 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 3587 3588 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs; 3589 3590 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 3591 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 3592 3593 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 3594 vmcs12->guest_rip = kvm_rip_read(vcpu); 3595 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 3596 3597 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 3598 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 3599 3600 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); 3601 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); 3602 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); 3603 3604 vmcs12->guest_interruptibility_info = 3605 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 3606 3607 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 3608 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 3609 else 3610 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 3611 3612 if (nested_cpu_has_preemption_timer(vmcs12) && 3613 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) 3614 vmcs12->vmx_preemption_timer_value = 3615 vmx_get_preemption_timer_value(vcpu); 3616 3617 /* 3618 * In some cases (usually, nested EPT), L2 is allowed to change its 3619 * own CR3 without exiting. If it has changed it, we must keep it. 3620 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 3621 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 3622 * 3623 * Additionally, restore L2's PDPTR to vmcs12. 
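 * The PDPTRs are only meaningful when L2 runs with EPT and PAE paging,
 * hence the checks below.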
3624 */ 3625 if (enable_ept) { 3626 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 3627 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3628 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 3629 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 3630 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 3631 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 3632 } 3633 } 3634 3635 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 3636 3637 if (nested_cpu_has_vid(vmcs12)) 3638 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 3639 3640 vmcs12->vm_entry_controls = 3641 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 3642 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 3643 3644 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 3645 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 3646 3647 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 3648 vmcs12->guest_ia32_efer = vcpu->arch.efer; 3649 } 3650 3651 /* 3652 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 3653 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 3654 * and this function updates it to reflect the changes to the guest state while 3655 * L2 was running (and perhaps made some exits which were handled directly by L0 3656 * without going back to L1), and to reflect the exit reason. 3657 * Note that we do not have to copy here all VMCS fields, just those that 3658 * could have changed by the L2 guest or the exit - i.e., the guest-state and 3659 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 3660 * which already writes to vmcs12 directly. 3661 */ 3662 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 3663 u32 exit_reason, u32 exit_intr_info, 3664 unsigned long exit_qualification) 3665 { 3666 /* update exit information fields: */ 3667 vmcs12->vm_exit_reason = exit_reason; 3668 vmcs12->exit_qualification = exit_qualification; 3669 vmcs12->vm_exit_intr_info = exit_intr_info; 3670 3671 vmcs12->idt_vectoring_info_field = 0; 3672 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3673 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 3674 3675 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 3676 vmcs12->launch_state = 1; 3677 3678 /* vm_entry_intr_info_field is cleared on exit. Emulate this 3679 * instead of reading the real value. */ 3680 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 3681 3682 /* 3683 * Transfer the event that L0 or L1 may have wanted to inject into 3684 * L2 to IDT_VECTORING_INFO_FIELD. 3685 */ 3686 vmcs12_save_pending_event(vcpu, vmcs12); 3687 3688 /* 3689 * According to the spec, there's no need to store the guest's 3690 * MSRs if the exit is due to a VM-entry failure that occurs 3691 * during or after loading the guest state. Since this exit 3692 * does not fall in that category, we need to save the MSRs. 3693 */ 3694 if (nested_vmx_store_msr(vcpu, 3695 vmcs12->vm_exit_msr_store_addr, 3696 vmcs12->vm_exit_msr_store_count)) 3697 nested_vmx_abort(vcpu, 3698 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 3699 } 3700 3701 /* 3702 * Drop what we picked up for L2 via vmx_complete_interrupts. It is 3703 * preserved above and would only end up incorrectly in L1.
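 * The reinjection state now lives in vmcs12's IDT-vectoring fields, so it
 * is safe to simply flush the vCPU's event queues.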
3704 */ 3705 vcpu->arch.nmi_injected = false; 3706 kvm_clear_exception_queue(vcpu); 3707 kvm_clear_interrupt_queue(vcpu); 3708 } 3709 3710 /* 3711 * A part of what we need to do when the nested L2 guest exits and we want to 3712 * run its L1 parent is to reset L1's guest state to the host state specified 3713 * in vmcs12. 3714 * This function is to be called not only on normal nested exit, but also on 3715 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 3716 * Failures During or After Loading Guest State"). 3717 * This function should be called when the active VMCS is L1's (vmcs01). 3718 */ 3719 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3720 struct vmcs12 *vmcs12) 3721 { 3722 struct kvm_segment seg; 3723 u32 entry_failure_code; 3724 3725 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 3726 vcpu->arch.efer = vmcs12->host_ia32_efer; 3727 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 3728 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 3729 else 3730 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 3731 vmx_set_efer(vcpu, vcpu->arch.efer); 3732 3733 kvm_rsp_write(vcpu, vmcs12->host_rsp); 3734 kvm_rip_write(vcpu, vmcs12->host_rip); 3735 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 3736 vmx_set_interrupt_shadow(vcpu, 0); 3737 3738 /* 3739 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 3740 * actually changed, because vmx_set_cr0 refers to efer set above. 3741 * 3742 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 3743 * (KVM doesn't change it). 3744 */ 3745 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 3746 vmx_set_cr0(vcpu, vmcs12->host_cr0); 3747 3748 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 3749 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 3750 vmx_set_cr4(vcpu, vmcs12->host_cr4); 3751 3752 nested_ept_uninit_mmu_context(vcpu); 3753 3754 /* 3755 * Only PDPTE load can fail as the value of cr3 was checked on entry and 3756 * couldn't have changed. 3757 */ 3758 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) 3759 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 3760 3761 if (!enable_ept) 3762 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 3763 3764 /* 3765 * If vmcs01 doesn't use VPID, the CPU flushes the TLB on every 3766 * VMEntry/VMExit. Thus, there is no need to flush the TLB. 3767 * 3768 * If vmcs12 doesn't use VPID, L1 expects the TLB to be 3769 * flushed on every VMEntry/VMExit. 3770 * 3771 * Otherwise, we can preserve TLB entries as long as we are 3772 * able to tag L1 TLB entries differently than L2 TLB entries. 3773 * 3774 * If vmcs12 uses EPT, we need to execute this flush on EPTP01 3775 * and therefore we request the TLB flush to happen only after VMCS EPTP 3776 * has been set by KVM_REQ_LOAD_CR3. 3777 */ 3778 if (enable_vpid && 3779 (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { 3780 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 3781 } 3782 3783 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 3784 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 3785 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 3786 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 3787 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 3788 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 3789 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 3790 3791 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.
*/ 3792 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 3793 vmcs_write64(GUEST_BNDCFGS, 0); 3794 3795 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 3796 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 3797 vcpu->arch.pat = vmcs12->host_ia32_pat; 3798 } 3799 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 3800 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 3801 vmcs12->host_ia32_perf_global_ctrl); 3802 3803 /* Set L1 segment info according to Intel SDM 3804 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 3805 seg = (struct kvm_segment) { 3806 .base = 0, 3807 .limit = 0xFFFFFFFF, 3808 .selector = vmcs12->host_cs_selector, 3809 .type = 11, 3810 .present = 1, 3811 .s = 1, 3812 .g = 1 3813 }; 3814 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 3815 seg.l = 1; 3816 else 3817 seg.db = 1; 3818 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 3819 seg = (struct kvm_segment) { 3820 .base = 0, 3821 .limit = 0xFFFFFFFF, 3822 .type = 3, 3823 .present = 1, 3824 .s = 1, 3825 .db = 1, 3826 .g = 1 3827 }; 3828 seg.selector = vmcs12->host_ds_selector; 3829 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 3830 seg.selector = vmcs12->host_es_selector; 3831 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 3832 seg.selector = vmcs12->host_ss_selector; 3833 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 3834 seg.selector = vmcs12->host_fs_selector; 3835 seg.base = vmcs12->host_fs_base; 3836 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 3837 seg.selector = vmcs12->host_gs_selector; 3838 seg.base = vmcs12->host_gs_base; 3839 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 3840 seg = (struct kvm_segment) { 3841 .base = vmcs12->host_tr_base, 3842 .limit = 0x67, 3843 .selector = vmcs12->host_tr_selector, 3844 .type = 11, 3845 .present = 1 3846 }; 3847 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 3848 3849 kvm_set_dr(vcpu, 7, 0x400); 3850 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 3851 3852 if (cpu_has_vmx_msr_bitmap()) 3853 vmx_update_msr_bitmap(vcpu); 3854 3855 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 3856 vmcs12->vm_exit_msr_load_count)) 3857 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 3858 } 3859 3860 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 3861 { 3862 struct shared_msr_entry *efer_msr; 3863 unsigned int i; 3864 3865 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 3866 return vmcs_read64(GUEST_IA32_EFER); 3867 3868 if (cpu_has_load_ia32_efer()) 3869 return host_efer; 3870 3871 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 3872 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 3873 return vmx->msr_autoload.guest.val[i].value; 3874 } 3875 3876 efer_msr = find_msr_entry(vmx, MSR_EFER); 3877 if (efer_msr) 3878 return efer_msr->data; 3879 3880 return host_efer; 3881 } 3882 3883 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 3884 { 3885 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3886 struct vcpu_vmx *vmx = to_vmx(vcpu); 3887 struct vmx_msr_entry g, h; 3888 struct msr_data msr; 3889 gpa_t gpa; 3890 u32 i, j; 3891 3892 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 3893 3894 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 3895 /* 3896 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 3897 * as vmcs01.GUEST_DR7 contains a userspace defined value 3898 * and vcpu->arch.dr7 is not squirreled away before the 3899 * nested VMENTER (not worth adding a variable in nested_vmx). 
3900 */ 3901 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 3902 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 3903 else 3904 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 3905 } 3906 3907 /* 3908 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 3909 * handle a variety of side effects to KVM's software model. 3910 */ 3911 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 3912 3913 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 3914 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 3915 3916 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 3917 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 3918 3919 nested_ept_uninit_mmu_context(vcpu); 3920 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3921 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 3922 3923 /* 3924 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 3925 * from vmcs01 (if necessary). The PDPTRs are not loaded on 3926 * VMFail, like everything else we just need to ensure our 3927 * software model is up-to-date. 3928 */ 3929 if (enable_ept) 3930 ept_save_pdptrs(vcpu); 3931 3932 kvm_mmu_reset_context(vcpu); 3933 3934 if (cpu_has_vmx_msr_bitmap()) 3935 vmx_update_msr_bitmap(vcpu); 3936 3937 /* 3938 * This nasty bit of open coding is a compromise between blindly 3939 * loading L1's MSRs using the exit load lists (incorrect emulation 3940 * of VMFail), leaving the nested VM's MSRs in the software model 3941 * (incorrect behavior) and snapshotting the modified MSRs (too 3942 * expensive since the lists are unbound by hardware). For each 3943 * MSR that was (prematurely) loaded from the nested VMEntry load 3944 * list, reload it from the exit load list if it exists and differs 3945 * from the guest value. The intent is to stuff host state as 3946 * silently as possible, not to fully process the exit load list. 3947 */ 3948 msr.host_initiated = false; 3949 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 3950 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 3951 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 3952 pr_debug_ratelimited( 3953 "%s read MSR index failed (%u, 0x%08llx)\n", 3954 __func__, i, gpa); 3955 goto vmabort; 3956 } 3957 3958 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 3959 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 3960 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 3961 pr_debug_ratelimited( 3962 "%s read MSR failed (%u, 0x%08llx)\n", 3963 __func__, j, gpa); 3964 goto vmabort; 3965 } 3966 if (h.index != g.index) 3967 continue; 3968 if (h.value == g.value) 3969 break; 3970 3971 if (nested_vmx_load_msr_check(vcpu, &h)) { 3972 pr_debug_ratelimited( 3973 "%s check failed (%u, 0x%x, 0x%x)\n", 3974 __func__, j, h.index, h.reserved); 3975 goto vmabort; 3976 } 3977 3978 msr.index = h.index; 3979 msr.data = h.value; 3980 if (kvm_set_msr(vcpu, &msr)) { 3981 pr_debug_ratelimited( 3982 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 3983 __func__, j, h.index, h.value); 3984 goto vmabort; 3985 } 3986 } 3987 } 3988 3989 return; 3990 3991 vmabort: 3992 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 3993 } 3994 3995 /* 3996 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 3997 * and modify vmcs12 to make it see what it would expect to see there if 3998 * L2 was its real guest. 
Must only be called when in L2 (is_guest_mode()) 3999 */ 4000 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, 4001 u32 exit_intr_info, unsigned long exit_qualification) 4002 { 4003 struct vcpu_vmx *vmx = to_vmx(vcpu); 4004 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4005 4006 /* trying to cancel vmlaunch/vmresume is a bug */ 4007 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4008 4009 leave_guest_mode(vcpu); 4010 4011 if (nested_cpu_has_preemption_timer(vmcs12)) 4012 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4013 4014 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 4015 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 4016 4017 if (likely(!vmx->fail)) { 4018 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4019 4020 if (exit_reason != -1) 4021 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 4022 exit_qualification); 4023 4024 /* 4025 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4026 * also be used to capture vmcs12 cache as part of 4027 * capturing nVMX state for snapshot (migration). 4028 * 4029 * Otherwise, this flush will dirty guest memory at a 4030 * point it is already assumed by user-space to be 4031 * immutable. 4032 */ 4033 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4034 } else { 4035 /* 4036 * The only expected VM-instruction error is "VM entry with 4037 * invalid control field(s)." Anything else indicates a 4038 * problem with L0. And we should never get here with a 4039 * VMFail of any type if early consistency checks are enabled. 4040 */ 4041 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4042 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4043 WARN_ON_ONCE(nested_early_check); 4044 } 4045 4046 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4047 4048 /* Update any VMCS fields that might have changed while L2 ran */ 4049 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4050 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4051 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4052 4053 if (kvm_has_tsc_control) 4054 decache_tsc_multiplier(vmx); 4055 4056 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4057 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4058 vmx_set_virtual_apic_mode(vcpu); 4059 } else if (!nested_cpu_has_ept(vmcs12) && 4060 nested_cpu_has2(vmcs12, 4061 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 4062 vmx_flush_tlb(vcpu, true); 4063 } 4064 4065 /* Unpin physical memory we referred to in vmcs02 */ 4066 if (vmx->nested.apic_access_page) { 4067 kvm_release_page_dirty(vmx->nested.apic_access_page); 4068 vmx->nested.apic_access_page = NULL; 4069 } 4070 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4071 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4072 vmx->nested.pi_desc = NULL; 4073 4074 /* 4075 * While running in L2, the mmu_notifier will force a reload of the 4076 * page's hpa for the L2 vmcs only. It needs to be reloaded for L1 before entering L1. 4077 */ 4078 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4079 4080 if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs)) 4081 vmx->nested.need_vmcs12_to_shadow_sync = true; 4082 4083 /* in case we halted in L2 */ 4084 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4085 4086 if (likely(!vmx->fail)) { 4087 /* 4088 * TODO: SDM says that with acknowledge interrupt on 4089 * exit, bit 31 of the VM-exit interrupt information 4090 * (valid interrupt) is always set to 1 on 4091 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't 4092 * need kvm_cpu_has_interrupt(). See the commit 4093 * message for details.
4094 */ 4095 if (nested_exit_intr_ack_set(vcpu) && 4096 exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4097 kvm_cpu_has_interrupt(vcpu)) { 4098 int irq = kvm_cpu_get_interrupt(vcpu); 4099 WARN_ON(irq < 0); 4100 vmcs12->vm_exit_intr_info = irq | 4101 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4102 } 4103 4104 if (exit_reason != -1) 4105 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4106 vmcs12->exit_qualification, 4107 vmcs12->idt_vectoring_info_field, 4108 vmcs12->vm_exit_intr_info, 4109 vmcs12->vm_exit_intr_error_code, 4110 KVM_ISA_VMX); 4111 4112 load_vmcs12_host_state(vcpu, vmcs12); 4113 4114 return; 4115 } 4116 4117 /* 4118 * After an early L2 VM-entry failure, we're now back 4119 * in L1 which thinks it just finished a VMLAUNCH or 4120 * VMRESUME instruction, so we need to set the failure 4121 * flag and the VM-instruction error field of the VMCS 4122 * accordingly, and skip the emulated instruction. 4123 */ 4124 (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4125 4126 /* 4127 * Restore L1's host state to KVM's software model. We're here 4128 * because a consistency check was caught by hardware, which 4129 * means some amount of guest state has been propagated to KVM's 4130 * model and needs to be unwound to the host's state. 4131 */ 4132 nested_vmx_restore_host_state(vcpu); 4133 4134 vmx->fail = 0; 4135 } 4136 4137 /* 4138 * Decode the memory-address operand of a vmx instruction, as recorded on an 4139 * exit caused by such an instruction (run by a guest hypervisor). 4140 * On success, returns 0. When the operand is invalid, returns 1 and throws 4141 * #UD or #GP. 4142 */ 4143 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4144 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4145 { 4146 gva_t off; 4147 bool exn; 4148 struct kvm_segment s; 4149 4150 /* 4151 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4152 * Execution", on an exit, vmx_instruction_info holds most of the 4153 * addressing components of the operand. Only the displacement part 4154 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4155 * For how an actual address is calculated from all these components, 4156 * refer to Vol. 1, "Operand Addressing". 4157 */ 4158 int scaling = vmx_instruction_info & 3; 4159 int addr_size = (vmx_instruction_info >> 7) & 7; 4160 bool is_reg = vmx_instruction_info & (1u << 10); 4161 int seg_reg = (vmx_instruction_info >> 15) & 7; 4162 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4163 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4164 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4165 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4166 4167 if (is_reg) { 4168 kvm_queue_exception(vcpu, UD_VECTOR); 4169 return 1; 4170 } 4171 4172 /* Addr = segment_base + offset */ 4173 /* offset = base + [index * scale] + displacement */ 4174 off = exit_qualification; /* holds the displacement */ 4175 if (addr_size == 1) 4176 off = (gva_t)sign_extend64(off, 31); 4177 else if (addr_size == 0) 4178 off = (gva_t)sign_extend64(off, 15); 4179 if (base_is_valid) 4180 off += kvm_register_read(vcpu, base_reg); 4181 if (index_is_valid) 4182 off += kvm_register_read(vcpu, index_reg)<<scaling; 4183 vmx_get_segment(vcpu, &s, seg_reg); 4184 4185 /* 4186 * The effective address, i.e. @off, of a memory operand is truncated 4187 * based on the address size of the instruction. Note that this is 4188 * the *effective address*, i.e. 
the address prior to accounting for 4189 * the segment's base. 4190 */ 4191 if (addr_size == 1) /* 32 bit */ 4192 off &= 0xffffffff; 4193 else if (addr_size == 0) /* 16 bit */ 4194 off &= 0xffff; 4195 4196 /* Checks for #GP/#SS exceptions. */ 4197 exn = false; 4198 if (is_long_mode(vcpu)) { 4199 /* 4200 * The virtual/linear address is never truncated in 64-bit 4201 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 4202 * address when using FS/GS with a non-zero base. 4203 */ 4204 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 4205 *ret = s.base + off; 4206 else 4207 *ret = off; 4208 4209 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 4210 * non-canonical form. This is the only check on the memory 4211 * destination for long mode! 4212 */ 4213 exn = is_noncanonical_address(*ret, vcpu); 4214 } else { 4215 /* 4216 * When not in long mode, the virtual/linear address is 4217 * unconditionally truncated to 32 bits regardless of the 4218 * address size. 4219 */ 4220 *ret = (s.base + off) & 0xffffffff; 4221 4222 /* Protected mode: apply checks for segment validity in the 4223 * following order: 4224 * - segment type check (#GP(0) may be thrown) 4225 * - usability check (#GP(0)/#SS(0)) 4226 * - limit check (#GP(0)/#SS(0)) 4227 */ 4228 if (wr) 4229 /* #GP(0) if the destination operand is located in a 4230 * read-only data segment or any code segment. 4231 */ 4232 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 4233 else 4234 /* #GP(0) if the source operand is located in an 4235 * execute-only code segment 4236 */ 4237 exn = ((s.type & 0xa) == 8); 4238 if (exn) { 4239 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 4240 return 1; 4241 } 4242 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 4243 */ 4244 exn = (s.unusable != 0); 4245 4246 /* 4247 * Protected mode: #GP(0)/#SS(0) if the memory operand is 4248 * outside the segment limit. All CPUs that support VMX ignore 4249 * limit checks for flat segments, i.e. segments with base==0, 4250 * limit==0xffffffff and of type expand-up data or code. 4251 */ 4252 if (!(s.base == 0 && s.limit == 0xffffffff && 4253 ((s.type & 8) || !(s.type & 4)))) 4254 exn = exn || ((u64)off + len - 1 > s.limit); 4255 } 4256 if (exn) { 4257 kvm_queue_exception_e(vcpu, 4258 seg_reg == VCPU_SREG_SS ? 4259 SS_VECTOR : GP_VECTOR, 4260 0); 4261 return 1; 4262 } 4263 4264 return 0; 4265 } 4266 4267 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) 4268 { 4269 gva_t gva; 4270 struct x86_exception e; 4271 4272 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 4273 vmcs_read32(VMX_INSTRUCTION_INFO), false, 4274 sizeof(*vmpointer), &gva)) 4275 return 1; 4276 4277 if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) { 4278 kvm_inject_page_fault(vcpu, &e); 4279 return 1; 4280 } 4281 4282 return 0; 4283 } 4284 4285 /* 4286 * Allocate a shadow VMCS and associate it with the currently loaded 4287 * VMCS, unless such a shadow VMCS already exists. The newly allocated 4288 * VMCS is also VMCLEARed, so that it is ready for use. 4289 */ 4290 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 4291 { 4292 struct vcpu_vmx *vmx = to_vmx(vcpu); 4293 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 4294 4295 /* 4296 * We should allocate a shadow vmcs for vmcs01 only when L1 4297 * executes VMXON and free it when L1 executes VMXOFF. 4298 * As it is invalid to execute VMXON twice, we shouldn't reach 4299 * here when vmcs01 already have an allocated shadow vmcs. 
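 * The WARN_ON below catches that case.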
4300 */ 4301 WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); 4302 4303 if (!loaded_vmcs->shadow_vmcs) { 4304 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 4305 if (loaded_vmcs->shadow_vmcs) 4306 vmcs_clear(loaded_vmcs->shadow_vmcs); 4307 } 4308 return loaded_vmcs->shadow_vmcs; 4309 } 4310 4311 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 4312 { 4313 struct vcpu_vmx *vmx = to_vmx(vcpu); 4314 int r; 4315 4316 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 4317 if (r < 0) 4318 goto out_vmcs02; 4319 4320 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4321 if (!vmx->nested.cached_vmcs12) 4322 goto out_cached_vmcs12; 4323 4324 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 4325 if (!vmx->nested.cached_shadow_vmcs12) 4326 goto out_cached_shadow_vmcs12; 4327 4328 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 4329 goto out_shadow_vmcs; 4330 4331 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 4332 HRTIMER_MODE_REL_PINNED); 4333 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 4334 4335 vmx->nested.vpid02 = allocate_vpid(); 4336 4337 vmx->nested.vmcs02_initialized = false; 4338 vmx->nested.vmxon = true; 4339 4340 if (pt_mode == PT_MODE_HOST_GUEST) { 4341 vmx->pt_desc.guest.ctl = 0; 4342 pt_update_intercept_for_msr(vmx); 4343 } 4344 4345 return 0; 4346 4347 out_shadow_vmcs: 4348 kfree(vmx->nested.cached_shadow_vmcs12); 4349 4350 out_cached_shadow_vmcs12: 4351 kfree(vmx->nested.cached_vmcs12); 4352 4353 out_cached_vmcs12: 4354 free_loaded_vmcs(&vmx->nested.vmcs02); 4355 4356 out_vmcs02: 4357 return -ENOMEM; 4358 } 4359 4360 /* 4361 * Emulate the VMXON instruction. 4362 * Currently, we just remember that VMX is active, and do not save or even 4363 * inspect the argument to VMXON (the so-called "VMXON pointer") because we 4364 * do not currently need to store anything in that guest-allocated memory 4365 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that their 4366 * argument is different from the VMXON pointer (which the spec says they do). 4367 */ 4368 static int handle_vmon(struct kvm_vcpu *vcpu) 4369 { 4370 int ret; 4371 gpa_t vmptr; 4372 uint32_t revision; 4373 struct vcpu_vmx *vmx = to_vmx(vcpu); 4374 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED 4375 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 4376 4377 /* 4378 * The Intel VMX Instruction Reference lists a bunch of bits that are 4379 * prerequisite to running VMXON, most notably cr4.VMXE must be set to 4380 * 1 (see vmx_set_cr4() for when we allow the guest to set this). 4381 * Otherwise, we should fail with #UD. But most faulting conditions 4382 * have already been checked by hardware, prior to the VM-exit for 4383 * VMXON. We do test guest cr4.VMXE because processor CR4 always has 4384 * that bit set to 1 in non-root mode. 4385 */ 4386 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { 4387 kvm_queue_exception(vcpu, UD_VECTOR); 4388 return 1; 4389 } 4390 4391 /* CPL=0 must be checked manually.
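 * In VMX non-root operation the VM exit for VMXON takes priority over
 * the CPL check, so the #GP(0) has to be synthesized here.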
*/ 4392 if (vmx_get_cpl(vcpu)) { 4393 kvm_inject_gp(vcpu, 0); 4394 return 1; 4395 } 4396 4397 if (vmx->nested.vmxon) 4398 return nested_vmx_failValid(vcpu, 4399 VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 4400 4401 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 4402 != VMXON_NEEDED_FEATURES) { 4403 kvm_inject_gp(vcpu, 0); 4404 return 1; 4405 } 4406 4407 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4408 return 1; 4409 4410 /* 4411 * SDM 3: 24.11.5 4412 * The first 4 bytes of the VMXON region contain the supported 4413 * VMCS revision identifier. 4414 * 4415 * Note: IA32_VMX_BASIC[48], which replaces the physical address 4416 * width with 32, will never be 1 for the nested case. 4417 */ 4418 if (!page_address_valid(vcpu, vmptr)) 4419 return nested_vmx_failInvalid(vcpu); 4420 4421 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 4422 revision != VMCS12_REVISION) 4423 return nested_vmx_failInvalid(vcpu); 4424 4425 vmx->nested.vmxon_ptr = vmptr; 4426 ret = enter_vmx_operation(vcpu); 4427 if (ret) 4428 return ret; 4429 4430 return nested_vmx_succeed(vcpu); 4431 } 4432 4433 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 4434 { 4435 struct vcpu_vmx *vmx = to_vmx(vcpu); 4436 4437 if (vmx->nested.current_vmptr == -1ull) 4438 return; 4439 4440 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 4441 4442 if (enable_shadow_vmcs) { 4443 /* copy to memory all shadowed fields in case 4444 they were modified */ 4445 copy_shadow_to_vmcs12(vmx); 4446 vmx_disable_shadow_vmcs(vmx); 4447 } 4448 vmx->nested.posted_intr_nv = -1; 4449 4450 /* Flush VMCS12 to guest memory */ 4451 kvm_vcpu_write_guest_page(vcpu, 4452 vmx->nested.current_vmptr >> PAGE_SHIFT, 4453 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 4454 4455 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 4456 4457 vmx->nested.current_vmptr = -1ull; 4458 } 4459 4460 /* Emulate the VMXOFF instruction */ 4461 static int handle_vmoff(struct kvm_vcpu *vcpu) 4462 { 4463 if (!nested_vmx_check_permission(vcpu)) 4464 return 1; 4465 free_nested(vcpu); 4466 return nested_vmx_succeed(vcpu); 4467 } 4468 4469 /* Emulate the VMCLEAR instruction */ 4470 static int handle_vmclear(struct kvm_vcpu *vcpu) 4471 { 4472 struct vcpu_vmx *vmx = to_vmx(vcpu); 4473 u32 zero = 0; 4474 gpa_t vmptr; 4475 u64 evmcs_gpa; 4476 4477 if (!nested_vmx_check_permission(vcpu)) 4478 return 1; 4479 4480 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4481 return 1; 4482 4483 if (!page_address_valid(vcpu, vmptr)) 4484 return nested_vmx_failValid(vcpu, 4485 VMXERR_VMCLEAR_INVALID_ADDRESS); 4486 4487 if (vmptr == vmx->nested.vmxon_ptr) 4488 return nested_vmx_failValid(vcpu, 4489 VMXERR_VMCLEAR_VMXON_POINTER); 4490 4491 /* 4492 * When Enlightened VMEntry is enabled on the calling CPU we treat the 4493 * memory area pointed to by vmptr as an Enlightened VMCS (as there's no good 4494 * way to distinguish it from VMCS12) and we must not corrupt it by 4495 * writing to the non-existent 'launch_state' field. The area doesn't 4496 * have to be the currently active EVMCS on the calling CPU and there's 4497 * nothing KVM has to do to transition it from 'active' to 'non-active' 4498 * state. It is possible that the area will stay mapped as 4499 * vmx->nested.hv_evmcs but this shouldn't be a problem.
4500 */ 4501 if (likely(!vmx->nested.enlightened_vmcs_enabled || 4502 !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) { 4503 if (vmptr == vmx->nested.current_vmptr) 4504 nested_release_vmcs12(vcpu); 4505 4506 kvm_vcpu_write_guest(vcpu, 4507 vmptr + offsetof(struct vmcs12, 4508 launch_state), 4509 &zero, sizeof(zero)); 4510 } 4511 4512 return nested_vmx_succeed(vcpu); 4513 } 4514 4515 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); 4516 4517 /* Emulate the VMLAUNCH instruction */ 4518 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 4519 { 4520 return nested_vmx_run(vcpu, true); 4521 } 4522 4523 /* Emulate the VMRESUME instruction */ 4524 static int handle_vmresume(struct kvm_vcpu *vcpu) 4525 { 4526 4527 return nested_vmx_run(vcpu, false); 4528 } 4529 4530 static int handle_vmread(struct kvm_vcpu *vcpu) 4531 { 4532 unsigned long field; 4533 u64 field_value; 4534 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4535 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4536 int len; 4537 gva_t gva = 0; 4538 struct vmcs12 *vmcs12; 4539 short offset; 4540 4541 if (!nested_vmx_check_permission(vcpu)) 4542 return 1; 4543 4544 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) 4545 return nested_vmx_failInvalid(vcpu); 4546 4547 if (!is_guest_mode(vcpu)) 4548 vmcs12 = get_vmcs12(vcpu); 4549 else { 4550 /* 4551 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD 4552 * to shadowed-field sets the ALU flags for VMfailInvalid. 4553 */ 4554 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) 4555 return nested_vmx_failInvalid(vcpu); 4556 vmcs12 = get_shadow_vmcs12(vcpu); 4557 } 4558 4559 /* Decode instruction info and find the field to read */ 4560 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 4561 4562 offset = vmcs_field_to_offset(field); 4563 if (offset < 0) 4564 return nested_vmx_failValid(vcpu, 4565 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4566 4567 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 4568 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4569 4570 /* Read the field, zero-extended to a u64 field_value */ 4571 field_value = vmcs12_read_any(vmcs12, field, offset); 4572 4573 /* 4574 * Now copy part of this value to register or memory, as requested. 4575 * Note that the number of bits actually copied is 32 or 64 depending 4576 * on the guest's mode (32 or 64 bit), not on the given field's length. 4577 */ 4578 if (vmx_instruction_info & (1u << 10)) { 4579 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), 4580 field_value); 4581 } else { 4582 len = is_64_bit_mode(vcpu) ? 
8 : 4; 4583 if (get_vmx_mem_address(vcpu, exit_qualification, 4584 vmx_instruction_info, true, len, &gva)) 4585 return 1; 4586 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 4587 kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL); 4588 } 4589 4590 return nested_vmx_succeed(vcpu); 4591 } 4592 4593 static bool is_shadow_field_rw(unsigned long field) 4594 { 4595 switch (field) { 4596 #define SHADOW_FIELD_RW(x, y) case x: 4597 #include "vmcs_shadow_fields.h" 4598 return true; 4599 default: 4600 break; 4601 } 4602 return false; 4603 } 4604 4605 static bool is_shadow_field_ro(unsigned long field) 4606 { 4607 switch (field) { 4608 #define SHADOW_FIELD_RO(x, y) case x: 4609 #include "vmcs_shadow_fields.h" 4610 return true; 4611 default: 4612 break; 4613 } 4614 return false; 4615 } 4616 4617 static int handle_vmwrite(struct kvm_vcpu *vcpu) 4618 { 4619 unsigned long field; 4620 int len; 4621 gva_t gva; 4622 struct vcpu_vmx *vmx = to_vmx(vcpu); 4623 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4624 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4625 4626 /* The value to write might be 32 or 64 bits, depending on L1's long 4627 * mode, and eventually we need to write that into a field of several 4628 * possible lengths. The code below first zero-extends the value to 64 4629 * bit (field_value), and then copies only the appropriate number of 4630 * bits into the vmcs12 field. 4631 */ 4632 u64 field_value = 0; 4633 struct x86_exception e; 4634 struct vmcs12 *vmcs12; 4635 short offset; 4636 4637 if (!nested_vmx_check_permission(vcpu)) 4638 return 1; 4639 4640 if (vmx->nested.current_vmptr == -1ull) 4641 return nested_vmx_failInvalid(vcpu); 4642 4643 if (vmx_instruction_info & (1u << 10)) 4644 field_value = kvm_register_readl(vcpu, 4645 (((vmx_instruction_info) >> 3) & 0xf)); 4646 else { 4647 len = is_64_bit_mode(vcpu) ? 8 : 4; 4648 if (get_vmx_mem_address(vcpu, exit_qualification, 4649 vmx_instruction_info, false, len, &gva)) 4650 return 1; 4651 if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) { 4652 kvm_inject_page_fault(vcpu, &e); 4653 return 1; 4654 } 4655 } 4656 4657 4658 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 4659 /* 4660 * If the vCPU supports "VMWRITE to any supported field in the 4661 * VMCS," then the "read-only" fields are actually read/write. 4662 */ 4663 if (vmcs_field_readonly(field) && 4664 !nested_cpu_has_vmwrite_any_field(vcpu)) 4665 return nested_vmx_failValid(vcpu, 4666 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 4667 4668 if (!is_guest_mode(vcpu)) { 4669 vmcs12 = get_vmcs12(vcpu); 4670 4671 /* 4672 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 4673 * vmcs12, else we may crush a field or consume a stale value. 4674 */ 4675 if (!is_shadow_field_rw(field)) 4676 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4677 } else { 4678 /* 4679 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE 4680 * to shadowed-field sets the ALU flags for VMfailInvalid. 4681 */ 4682 if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) 4683 return nested_vmx_failInvalid(vcpu); 4684 vmcs12 = get_shadow_vmcs12(vcpu); 4685 } 4686 4687 offset = vmcs_field_to_offset(field); 4688 if (offset < 0) 4689 return nested_vmx_failValid(vcpu, 4690 VMXERR_UNSUPPORTED_VMCS_COMPONENT); 4691 4692 /* 4693 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 4694 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 4695 * behavior regardless of the underlying hardware, e.g. 
if an AR_BYTE 4696 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 4697 * from L1 will return a different value than VMREAD from L2 (L1 sees 4698 * the stripped down value, L2 sees the full value as stored by KVM). 4699 */ 4700 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 4701 field_value &= 0x1f0ff; 4702 4703 vmcs12_write_any(vmcs12, field, offset, field_value); 4704 4705 /* 4706 * Do not track vmcs12 dirty-state if in guest-mode as we actually 4707 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 4708 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 4709 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 4710 */ 4711 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 4712 /* 4713 * L1 can read these fields without exiting, ensure the 4714 * shadow VMCS is up-to-date. 4715 */ 4716 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 4717 preempt_disable(); 4718 vmcs_load(vmx->vmcs01.shadow_vmcs); 4719 4720 __vmcs_writel(field, field_value); 4721 4722 vmcs_clear(vmx->vmcs01.shadow_vmcs); 4723 vmcs_load(vmx->loaded_vmcs->vmcs); 4724 preempt_enable(); 4725 } 4726 vmx->nested.dirty_vmcs12 = true; 4727 } 4728 4729 return nested_vmx_succeed(vcpu); 4730 } 4731 4732 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 4733 { 4734 vmx->nested.current_vmptr = vmptr; 4735 if (enable_shadow_vmcs) { 4736 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 4737 vmcs_write64(VMCS_LINK_POINTER, 4738 __pa(vmx->vmcs01.shadow_vmcs)); 4739 vmx->nested.need_vmcs12_to_shadow_sync = true; 4740 } 4741 vmx->nested.dirty_vmcs12 = true; 4742 } 4743 4744 /* Emulate the VMPTRLD instruction */ 4745 static int handle_vmptrld(struct kvm_vcpu *vcpu) 4746 { 4747 struct vcpu_vmx *vmx = to_vmx(vcpu); 4748 gpa_t vmptr; 4749 4750 if (!nested_vmx_check_permission(vcpu)) 4751 return 1; 4752 4753 if (nested_vmx_get_vmptr(vcpu, &vmptr)) 4754 return 1; 4755 4756 if (!page_address_valid(vcpu, vmptr)) 4757 return nested_vmx_failValid(vcpu, 4758 VMXERR_VMPTRLD_INVALID_ADDRESS); 4759 4760 if (vmptr == vmx->nested.vmxon_ptr) 4761 return nested_vmx_failValid(vcpu, 4762 VMXERR_VMPTRLD_VMXON_POINTER); 4763 4764 /* Forbid normal VMPTRLD if Enlightened version was used */ 4765 if (vmx->nested.hv_evmcs) 4766 return 1; 4767 4768 if (vmx->nested.current_vmptr != vmptr) { 4769 struct kvm_host_map map; 4770 struct vmcs12 *new_vmcs12; 4771 4772 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) { 4773 /* 4774 * Reads from an unbacked page return all 1s, 4775 * which means that the 32 bits located at the 4776 * given physical address won't match the required 4777 * VMCS12_REVISION identifier. 4778 */ 4779 return nested_vmx_failValid(vcpu, 4780 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 4781 } 4782 4783 new_vmcs12 = map.hva; 4784 4785 if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || 4786 (new_vmcs12->hdr.shadow_vmcs && 4787 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 4788 kvm_vcpu_unmap(vcpu, &map, false); 4789 return nested_vmx_failValid(vcpu, 4790 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 4791 } 4792 4793 nested_release_vmcs12(vcpu); 4794 4795 /* 4796 * Load VMCS12 from guest memory since it is not already 4797 * cached. 
4798 */ 4799 memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE); 4800 kvm_vcpu_unmap(vcpu, &map, false); 4801 4802 set_current_vmptr(vmx, vmptr); 4803 } 4804 4805 return nested_vmx_succeed(vcpu); 4806 } 4807 4808 /* Emulate the VMPTRST instruction */ 4809 static int handle_vmptrst(struct kvm_vcpu *vcpu) 4810 { 4811 unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); 4812 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4813 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 4814 struct x86_exception e; 4815 gva_t gva; 4816 4817 if (!nested_vmx_check_permission(vcpu)) 4818 return 1; 4819 4820 if (unlikely(to_vmx(vcpu)->nested.hv_evmcs)) 4821 return 1; 4822 4823 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 4824 true, sizeof(gpa_t), &gva)) 4825 return 1; 4826 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 4827 if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, 4828 sizeof(gpa_t), &e)) { 4829 kvm_inject_page_fault(vcpu, &e); 4830 return 1; 4831 } 4832 return nested_vmx_succeed(vcpu); 4833 } 4834 4835 /* Emulate the INVEPT instruction */ 4836 static int handle_invept(struct kvm_vcpu *vcpu) 4837 { 4838 struct vcpu_vmx *vmx = to_vmx(vcpu); 4839 u32 vmx_instruction_info, types; 4840 unsigned long type; 4841 gva_t gva; 4842 struct x86_exception e; 4843 struct { 4844 u64 eptp, gpa; 4845 } operand; 4846 4847 if (!(vmx->nested.msrs.secondary_ctls_high & 4848 SECONDARY_EXEC_ENABLE_EPT) || 4849 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 4850 kvm_queue_exception(vcpu, UD_VECTOR); 4851 return 1; 4852 } 4853 4854 if (!nested_vmx_check_permission(vcpu)) 4855 return 1; 4856 4857 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4858 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 4859 4860 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 4861 4862 if (type >= 32 || !(types & (1 << type))) 4863 return nested_vmx_failValid(vcpu, 4864 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 4865 4866 /* According to the Intel VMX instruction reference, the memory 4867 * operand is read even if it isn't needed (e.g., for type==global) 4868 */ 4869 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 4870 vmx_instruction_info, false, sizeof(operand), &gva)) 4871 return 1; 4872 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { 4873 kvm_inject_page_fault(vcpu, &e); 4874 return 1; 4875 } 4876 4877 switch (type) { 4878 case VMX_EPT_EXTENT_GLOBAL: 4879 case VMX_EPT_EXTENT_CONTEXT: 4880 /* 4881 * TODO: Sync the necessary shadow EPT roots here, rather than 4882 * at the next emulated VM-entry.
4883 */ 4884 break; 4885 default: 4886 BUG_ON(1); 4887 break; 4888 } 4889 4890 return nested_vmx_succeed(vcpu); 4891 } 4892 4893 static int handle_invvpid(struct kvm_vcpu *vcpu) 4894 { 4895 struct vcpu_vmx *vmx = to_vmx(vcpu); 4896 u32 vmx_instruction_info; 4897 unsigned long type, types; 4898 gva_t gva; 4899 struct x86_exception e; 4900 struct { 4901 u64 vpid; 4902 u64 gla; 4903 } operand; 4904 u16 vpid02; 4905 4906 if (!(vmx->nested.msrs.secondary_ctls_high & 4907 SECONDARY_EXEC_ENABLE_VPID) || 4908 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 4909 kvm_queue_exception(vcpu, UD_VECTOR); 4910 return 1; 4911 } 4912 4913 if (!nested_vmx_check_permission(vcpu)) 4914 return 1; 4915 4916 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4917 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 4918 4919 types = (vmx->nested.msrs.vpid_caps & 4920 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 4921 4922 if (type >= 32 || !(types & (1 << type))) 4923 return nested_vmx_failValid(vcpu, 4924 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 4925 4926 /* according to the intel vmx instruction reference, the memory 4927 * operand is read even if it isn't needed (e.g., for type==global) 4928 */ 4929 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 4930 vmx_instruction_info, false, sizeof(operand), &gva)) 4931 return 1; 4932 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { 4933 kvm_inject_page_fault(vcpu, &e); 4934 return 1; 4935 } 4936 if (operand.vpid >> 16) 4937 return nested_vmx_failValid(vcpu, 4938 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 4939 4940 vpid02 = nested_get_vpid02(vcpu); 4941 switch (type) { 4942 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 4943 if (!operand.vpid || 4944 is_noncanonical_address(operand.gla, vcpu)) 4945 return nested_vmx_failValid(vcpu, 4946 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 4947 if (cpu_has_vmx_invvpid_individual_addr()) { 4948 __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, 4949 vpid02, operand.gla); 4950 } else 4951 __vmx_flush_tlb(vcpu, vpid02, false); 4952 break; 4953 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 4954 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 4955 if (!operand.vpid) 4956 return nested_vmx_failValid(vcpu, 4957 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 4958 __vmx_flush_tlb(vcpu, vpid02, false); 4959 break; 4960 case VMX_VPID_EXTENT_ALL_CONTEXT: 4961 __vmx_flush_tlb(vcpu, vpid02, false); 4962 break; 4963 default: 4964 WARN_ON_ONCE(1); 4965 return kvm_skip_emulated_instruction(vcpu); 4966 } 4967 4968 return nested_vmx_succeed(vcpu); 4969 } 4970 4971 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 4972 struct vmcs12 *vmcs12) 4973 { 4974 u32 index = kvm_rcx_read(vcpu); 4975 u64 address; 4976 bool accessed_dirty; 4977 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 4978 4979 if (!nested_cpu_has_eptp_switching(vmcs12) || 4980 !nested_cpu_has_ept(vmcs12)) 4981 return 1; 4982 4983 if (index >= VMFUNC_EPTP_ENTRIES) 4984 return 1; 4985 4986 4987 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 4988 &address, index * 8, 8)) 4989 return 1; 4990 4991 accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); 4992 4993 /* 4994 * If the (L2) guest does a vmfunc to the currently 4995 * active ept pointer, we don't have to do anything else 4996 */ 4997 if (vmcs12->ept_pointer != address) { 4998 if (!valid_ept_address(vcpu, address)) 4999 return 1; 5000 5001 kvm_mmu_unload(vcpu); 5002 mmu->ept_ad = accessed_dirty; 5003 mmu->mmu_role.base.ad_disabled = !accessed_dirty; 5004 vmcs12->ept_pointer = 
address; 5005 /* 5006 * TODO: Check what's the correct approach in case 5007 * mmu reload fails. Currently, we just let the next 5008 * reload potentially fail. 5009 */ 5010 kvm_mmu_reload(vcpu); 5011 } 5012 5013 return 0; 5014 } 5015 5016 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5017 { 5018 struct vcpu_vmx *vmx = to_vmx(vcpu); 5019 struct vmcs12 *vmcs12; 5020 u32 function = kvm_rax_read(vcpu); 5021 5022 /* 5023 * VMFUNC is only supported for nested guests, but we always enable the 5024 * secondary control for simplicity; for non-nested mode, fake that we 5025 * didn't enable it by injecting #UD. 5026 */ 5027 if (!is_guest_mode(vcpu)) { 5028 kvm_queue_exception(vcpu, UD_VECTOR); 5029 return 1; 5030 } 5031 5032 vmcs12 = get_vmcs12(vcpu); 5033 if ((vmcs12->vm_function_control & (1 << function)) == 0) 5034 goto fail; 5035 5036 switch (function) { 5037 case 0: 5038 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5039 goto fail; 5040 break; 5041 default: 5042 goto fail; 5043 } 5044 return kvm_skip_emulated_instruction(vcpu); 5045 5046 fail: 5047 nested_vmx_vmexit(vcpu, vmx->exit_reason, 5048 vmcs_read32(VM_EXIT_INTR_INFO), 5049 vmcs_readl(EXIT_QUALIFICATION)); 5050 return 1; 5051 } 5052 5053 5054 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5055 struct vmcs12 *vmcs12) 5056 { 5057 unsigned long exit_qualification; 5058 gpa_t bitmap, last_bitmap; 5059 unsigned int port; 5060 int size; 5061 u8 b; 5062 5063 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5064 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 5065 5066 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5067 5068 port = exit_qualification >> 16; 5069 size = (exit_qualification & 7) + 1; 5070 5071 last_bitmap = (gpa_t)-1; 5072 b = -1; 5073 5074 while (size > 0) { 5075 if (port < 0x8000) 5076 bitmap = vmcs12->io_bitmap_a; 5077 else if (port < 0x10000) 5078 bitmap = vmcs12->io_bitmap_b; 5079 else 5080 return true; 5081 bitmap += (port & 0x7fff) / 8; 5082 5083 if (last_bitmap != bitmap) 5084 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5085 return true; 5086 if (b & (1 << (port & 7))) 5087 return true; 5088 5089 port++; 5090 size--; 5091 last_bitmap = bitmap; 5092 } 5093 5094 return false; 5095 } 5096 5097 /* 5098 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 5099 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 5100 * disinterest in the current event (read or write a specific MSR) by using an 5101 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 5102 */ 5103 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 5104 struct vmcs12 *vmcs12, u32 exit_reason) 5105 { 5106 u32 msr_index = kvm_rcx_read(vcpu); 5107 gpa_t bitmap; 5108 5109 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 5110 return true; 5111 5112 /* 5113 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 5114 * for the four combinations of read/write and low/high MSR numbers.
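 * For example, the layout implied by the offsets used below is:
 *   0x000-0x3ff: read bitmap for MSRs 0x00000000-0x00001fff
 *   0x400-0x7ff: read bitmap for MSRs 0xc0000000-0xc0001fff
 *   0x800-0xbff: write bitmap for MSRs 0x00000000-0x00001fff
 *   0xc00-0xfff: write bitmap for MSRs 0xc0000000-0xc0001fff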
5115 * First we need to figure out which of the four to use: 5116 */ 5117 bitmap = vmcs12->msr_bitmap; 5118 if (exit_reason == EXIT_REASON_MSR_WRITE) 5119 bitmap += 2048; 5120 if (msr_index >= 0xc0000000) { 5121 msr_index -= 0xc0000000; 5122 bitmap += 1024; 5123 } 5124 5125 /* Then read the msr_index'th bit from this bitmap: */ 5126 if (msr_index < 1024*8) { 5127 unsigned char b; 5128 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 5129 return true; 5130 return 1 & (b >> (msr_index & 7)); 5131 } else 5132 return true; /* let L1 handle the wrong parameter */ 5133 } 5134 5135 /* 5136 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 5137 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 5138 * intercept (via guest_host_mask etc.) the current event. 5139 */ 5140 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 5141 struct vmcs12 *vmcs12) 5142 { 5143 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5144 int cr = exit_qualification & 15; 5145 int reg; 5146 unsigned long val; 5147 5148 switch ((exit_qualification >> 4) & 3) { 5149 case 0: /* mov to cr */ 5150 reg = (exit_qualification >> 8) & 15; 5151 val = kvm_register_readl(vcpu, reg); 5152 switch (cr) { 5153 case 0: 5154 if (vmcs12->cr0_guest_host_mask & 5155 (val ^ vmcs12->cr0_read_shadow)) 5156 return true; 5157 break; 5158 case 3: 5159 if ((vmcs12->cr3_target_count >= 1 && 5160 vmcs12->cr3_target_value0 == val) || 5161 (vmcs12->cr3_target_count >= 2 && 5162 vmcs12->cr3_target_value1 == val) || 5163 (vmcs12->cr3_target_count >= 3 && 5164 vmcs12->cr3_target_value2 == val) || 5165 (vmcs12->cr3_target_count >= 4 && 5166 vmcs12->cr3_target_value3 == val)) 5167 return false; 5168 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 5169 return true; 5170 break; 5171 case 4: 5172 if (vmcs12->cr4_guest_host_mask & 5173 (vmcs12->cr4_read_shadow ^ val)) 5174 return true; 5175 break; 5176 case 8: 5177 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 5178 return true; 5179 break; 5180 } 5181 break; 5182 case 2: /* clts */ 5183 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 5184 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 5185 return true; 5186 break; 5187 case 1: /* mov from cr */ 5188 switch (cr) { 5189 case 3: 5190 if (vmcs12->cpu_based_vm_exec_control & 5191 CPU_BASED_CR3_STORE_EXITING) 5192 return true; 5193 break; 5194 case 8: 5195 if (vmcs12->cpu_based_vm_exec_control & 5196 CPU_BASED_CR8_STORE_EXITING) 5197 return true; 5198 break; 5199 } 5200 break; 5201 case 3: /* lmsw */ 5202 /* 5203 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 5204 * cr0. Other attempted changes are ignored, with no exit. 
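 * For example, with cr0_guest_host_mask == 0x8 (L1 owns only CR0.TS) and
 * TS clear in cr0_read_shadow, an lmsw from L2 that sets TS must be
 * reflected to L1, while one that leaves TS clear can be handled in L0.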
5205 */ 5206 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5207 if (vmcs12->cr0_guest_host_mask & 0xe & 5208 (val ^ vmcs12->cr0_read_shadow)) 5209 return true; 5210 if ((vmcs12->cr0_guest_host_mask & 0x1) && 5211 !(vmcs12->cr0_read_shadow & 0x1) && 5212 (val & 0x1)) 5213 return true; 5214 break; 5215 } 5216 return false; 5217 } 5218 5219 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 5220 struct vmcs12 *vmcs12, gpa_t bitmap) 5221 { 5222 u32 vmx_instruction_info; 5223 unsigned long field; 5224 u8 b; 5225 5226 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 5227 return true; 5228 5229 /* Decode instruction info and find the field to access */ 5230 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5231 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 5232 5233 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 5234 if (field >> 15) 5235 return true; 5236 5237 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 5238 return true; 5239 5240 return 1 & (b >> (field & 7)); 5241 } 5242 5243 /* 5244 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we 5245 * should handle it ourselves in L0 (and then continue L2). Only call this 5246 * when in is_guest_mode (L2). 5247 */ 5248 bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) 5249 { 5250 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 5251 struct vcpu_vmx *vmx = to_vmx(vcpu); 5252 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5253 5254 if (vmx->nested.nested_run_pending) 5255 return false; 5256 5257 if (unlikely(vmx->fail)) { 5258 pr_info_ratelimited("%s failed vm entry %x\n", __func__, 5259 vmcs_read32(VM_INSTRUCTION_ERROR)); 5260 return true; 5261 } 5262 5263 /* 5264 * The host physical addresses of some pages of guest memory 5265 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC 5266 * Page). The CPU may write to these pages via their host 5267 * physical address while L2 is running, bypassing any 5268 * address-translation-based dirty tracking (e.g. EPT write 5269 * protection). 5270 * 5271 * Mark them dirty on every exit from L2 to prevent them from 5272 * getting out of sync with dirty tracking. 
5273 */ 5274 nested_mark_vmcs12_pages_dirty(vcpu); 5275 5276 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, 5277 vmcs_readl(EXIT_QUALIFICATION), 5278 vmx->idt_vectoring_info, 5279 intr_info, 5280 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 5281 KVM_ISA_VMX); 5282 5283 switch (exit_reason) { 5284 case EXIT_REASON_EXCEPTION_NMI: 5285 if (is_nmi(intr_info)) 5286 return false; 5287 else if (is_page_fault(intr_info)) 5288 return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; 5289 else if (is_debug(intr_info) && 5290 vcpu->guest_debug & 5291 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5292 return false; 5293 else if (is_breakpoint(intr_info) && 5294 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5295 return false; 5296 return vmcs12->exception_bitmap & 5297 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 5298 case EXIT_REASON_EXTERNAL_INTERRUPT: 5299 return false; 5300 case EXIT_REASON_TRIPLE_FAULT: 5301 return true; 5302 case EXIT_REASON_PENDING_INTERRUPT: 5303 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); 5304 case EXIT_REASON_NMI_WINDOW: 5305 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); 5306 case EXIT_REASON_TASK_SWITCH: 5307 return true; 5308 case EXIT_REASON_CPUID: 5309 return true; 5310 case EXIT_REASON_HLT: 5311 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 5312 case EXIT_REASON_INVD: 5313 return true; 5314 case EXIT_REASON_INVLPG: 5315 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5316 case EXIT_REASON_RDPMC: 5317 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 5318 case EXIT_REASON_RDRAND: 5319 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 5320 case EXIT_REASON_RDSEED: 5321 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 5322 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 5323 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 5324 case EXIT_REASON_VMREAD: 5325 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5326 vmcs12->vmread_bitmap); 5327 case EXIT_REASON_VMWRITE: 5328 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 5329 vmcs12->vmwrite_bitmap); 5330 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 5331 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 5332 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 5333 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 5334 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 5335 /* 5336 * VMX instructions trap unconditionally. This allows L1 to 5337 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
5338 */ 5339 return true; 5340 case EXIT_REASON_CR_ACCESS: 5341 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 5342 case EXIT_REASON_DR_ACCESS: 5343 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 5344 case EXIT_REASON_IO_INSTRUCTION: 5345 return nested_vmx_exit_handled_io(vcpu, vmcs12); 5346 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 5347 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 5348 case EXIT_REASON_MSR_READ: 5349 case EXIT_REASON_MSR_WRITE: 5350 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 5351 case EXIT_REASON_INVALID_STATE: 5352 return true; 5353 case EXIT_REASON_MWAIT_INSTRUCTION: 5354 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 5355 case EXIT_REASON_MONITOR_TRAP_FLAG: 5356 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); 5357 case EXIT_REASON_MONITOR_INSTRUCTION: 5358 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 5359 case EXIT_REASON_PAUSE_INSTRUCTION: 5360 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 5361 nested_cpu_has2(vmcs12, 5362 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 5363 case EXIT_REASON_MCE_DURING_VMENTRY: 5364 return false; 5365 case EXIT_REASON_TPR_BELOW_THRESHOLD: 5366 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 5367 case EXIT_REASON_APIC_ACCESS: 5368 case EXIT_REASON_APIC_WRITE: 5369 case EXIT_REASON_EOI_INDUCED: 5370 /* 5371 * The controls for "virtualize APIC accesses," "APIC- 5372 * register virtualization," and "virtual-interrupt 5373 * delivery" only come from vmcs12. 5374 */ 5375 return true; 5376 case EXIT_REASON_EPT_VIOLATION: 5377 /* 5378 * L0 always deals with the EPT violation. If nested EPT is 5379 * used, and the nested mmu code discovers that the address is 5380 * missing in the guest EPT table (EPT12), the EPT violation 5381 * will be injected with nested_ept_inject_page_fault() 5382 */ 5383 return false; 5384 case EXIT_REASON_EPT_MISCONFIG: 5385 /* 5386 * L2 never uses directly L1's EPT, but rather L0's own EPT 5387 * table (shadow on EPT) or a merged EPT table that L0 built 5388 * (EPT on EPT). So any problems with the structure of the 5389 * table is L0's fault. 5390 */ 5391 return false; 5392 case EXIT_REASON_INVPCID: 5393 return 5394 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 5395 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 5396 case EXIT_REASON_WBINVD: 5397 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 5398 case EXIT_REASON_XSETBV: 5399 return true; 5400 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 5401 /* 5402 * This should never happen, since it is not possible to 5403 * set XSS to a non-zero value---neither in L1 nor in L2. 5404 * If if it were, XSS would have to be checked against 5405 * the XSS exit bitmap in vmcs12. 5406 */ 5407 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); 5408 case EXIT_REASON_PREEMPTION_TIMER: 5409 return false; 5410 case EXIT_REASON_PML_FULL: 5411 /* We emulate PML support to L1. */ 5412 return false; 5413 case EXIT_REASON_VMFUNC: 5414 /* VM functions are emulated through L2->L0 vmexits. 
		 */
		return false;
	case EXIT_REASON_ENCLS:
		/* SGX is never exposed to L1 */
		return false;
	default:
		return true;
	}
}

static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_VMX,
		.size = sizeof(kvm_state),
		.hdr.vmx.vmxon_pa = -1ull,
		.hdr.vmx.vmcs12_pa = -1ull,
	};
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];

	if (!vcpu)
		return kvm_state.size + sizeof(*user_vmx_nested_state);

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (nested_vmx_allowed(vcpu) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;

		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);

			if (vmx->nested.hv_evmcs)
				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != -1ull)
				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
		}
	}

	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02. When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
	} else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
		if (vmx->nested.hv_evmcs)
			copy_enlightened_to_vmcs12(vmx);
		else if (enable_shadow_vmcs)
			copy_shadow_to_vmcs12(vmx);
	}

	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);

	/*
	 * Copy over the full allocated size of vmcs12 rather than just the size
	 * of the struct.
	 */
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != -1ull) {
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
			return -EFAULT;
	}

out:
	return kvm_state.size;
}

/*
 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
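 * This emulates a VM-exit from L2 if the vCPU is currently in guest mode and
 * then frees all nested state.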
5527 */ 5528 void vmx_leave_nested(struct kvm_vcpu *vcpu) 5529 { 5530 if (is_guest_mode(vcpu)) { 5531 to_vmx(vcpu)->nested.nested_run_pending = 0; 5532 nested_vmx_vmexit(vcpu, -1, 0, 0); 5533 } 5534 free_nested(vcpu); 5535 } 5536 5537 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 5538 struct kvm_nested_state __user *user_kvm_nested_state, 5539 struct kvm_nested_state *kvm_state) 5540 { 5541 struct vcpu_vmx *vmx = to_vmx(vcpu); 5542 struct vmcs12 *vmcs12; 5543 u32 exit_qual; 5544 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 5545 &user_kvm_nested_state->data.vmx[0]; 5546 int ret; 5547 5548 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 5549 return -EINVAL; 5550 5551 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) { 5552 if (kvm_state->hdr.vmx.smm.flags) 5553 return -EINVAL; 5554 5555 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) 5556 return -EINVAL; 5557 5558 /* 5559 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 5560 * enable eVMCS capability on vCPU. However, since then 5561 * code was changed such that flag signals vmcs12 should 5562 * be copied into eVMCS in guest memory. 5563 * 5564 * To preserve backwards compatability, allow user 5565 * to set this flag even when there is no VMXON region. 5566 */ 5567 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 5568 return -EINVAL; 5569 } else { 5570 if (!nested_vmx_allowed(vcpu)) 5571 return -EINVAL; 5572 5573 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 5574 return -EINVAL; 5575 } 5576 5577 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 5578 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 5579 return -EINVAL; 5580 5581 if (kvm_state->hdr.vmx.smm.flags & 5582 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 5583 return -EINVAL; 5584 5585 /* 5586 * SMM temporarily disables VMX, so we cannot be in guest mode, 5587 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 5588 * must be zero. 5589 */ 5590 if (is_smm(vcpu) ? 5591 (kvm_state->flags & 5592 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 5593 : kvm_state->hdr.vmx.smm.flags) 5594 return -EINVAL; 5595 5596 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 5597 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 5598 return -EINVAL; 5599 5600 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 5601 (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) 5602 return -EINVAL; 5603 5604 vmx_leave_nested(vcpu); 5605 5606 if (kvm_state->hdr.vmx.vmxon_pa == -1ull) 5607 return 0; 5608 5609 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 5610 ret = enter_vmx_operation(vcpu); 5611 if (ret) 5612 return ret; 5613 5614 /* Empty 'VMXON' state is permitted */ 5615 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) 5616 return 0; 5617 5618 if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { 5619 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 5620 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 5621 return -EINVAL; 5622 5623 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 5624 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 5625 /* 5626 * Sync eVMCS upon entry as we may not have 5627 * HV_X64_MSR_VP_ASSIST_PAGE set up yet. 
5628 */ 5629 vmx->nested.need_vmcs12_to_shadow_sync = true; 5630 } else { 5631 return -EINVAL; 5632 } 5633 5634 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 5635 vmx->nested.smm.vmxon = true; 5636 vmx->nested.vmxon = false; 5637 5638 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 5639 vmx->nested.smm.guest_mode = true; 5640 } 5641 5642 vmcs12 = get_vmcs12(vcpu); 5643 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 5644 return -EFAULT; 5645 5646 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 5647 return -EINVAL; 5648 5649 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 5650 return 0; 5651 5652 vmx->nested.nested_run_pending = 5653 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 5654 5655 ret = -EINVAL; 5656 if (nested_cpu_has_shadow_vmcs(vmcs12) && 5657 vmcs12->vmcs_link_pointer != -1ull) { 5658 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 5659 5660 if (kvm_state->size < 5661 sizeof(*kvm_state) + 5662 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 5663 goto error_guest_mode; 5664 5665 if (copy_from_user(shadow_vmcs12, 5666 user_vmx_nested_state->shadow_vmcs12, 5667 sizeof(*shadow_vmcs12))) { 5668 ret = -EFAULT; 5669 goto error_guest_mode; 5670 } 5671 5672 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 5673 !shadow_vmcs12->hdr.shadow_vmcs) 5674 goto error_guest_mode; 5675 } 5676 5677 if (nested_vmx_check_controls(vcpu, vmcs12) || 5678 nested_vmx_check_host_state(vcpu, vmcs12) || 5679 nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) 5680 goto error_guest_mode; 5681 5682 vmx->nested.dirty_vmcs12 = true; 5683 ret = nested_vmx_enter_non_root_mode(vcpu, false); 5684 if (ret) 5685 goto error_guest_mode; 5686 5687 return 0; 5688 5689 error_guest_mode: 5690 vmx->nested.nested_run_pending = 0; 5691 return ret; 5692 } 5693 5694 void nested_vmx_vcpu_setup(void) 5695 { 5696 if (enable_shadow_vmcs) { 5697 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 5698 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 5699 } 5700 } 5701 5702 /* 5703 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 5704 * returned for the various VMX controls MSRs when nested VMX is enabled. 5705 * The same values should also be used to verify that vmcs12 control fields are 5706 * valid during nested entry from L1 to L2. 5707 * Each of these control msrs has a low and high 32-bit half: A low bit is on 5708 * if the corresponding bit in the (32-bit) control field *must* be on, and a 5709 * bit in the high half is on if the corresponding bit in the control field 5710 * may be on. See also vmx_control_verify(). 5711 */ 5712 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, 5713 bool apicv) 5714 { 5715 /* 5716 * Note that as a general rule, the high half of the MSRs (bits in 5717 * the control fields which may be 1) should be initialized by the 5718 * intersection of the underlying hardware's MSR (i.e., features which 5719 * can be supported) and the list of features we want to expose - 5720 * because they are known to be properly supported in our code. 5721 * Also, usually, the low half of the MSRs (bits which must be 1) can 5722 * be set to 0, meaning that L1 may turn off any of these bits. 
	 * The reason is that if one of these bits is necessary, it will
	 * appear in vmcs01, and prepare_vmcs02, which bitwise-ORs the
	 * control fields of vmcs01 and vmcs12, will keep it set in vmcs02 -
	 * and nested_vmx_exit_reflected() will not pass the related exits
	 * to L1.
	 * These rules have exceptions below.
	 */

	/* pin-based controls */
	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
		msrs->pinbased_ctls_low,
		msrs->pinbased_ctls_high);
	msrs->pinbased_ctls_low |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;

	/* exit controls */
	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
		msrs->exit_ctls_low,
		msrs->exit_ctls_high);
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;

	/* entry controls */
	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
		msrs->entry_ctls_low,
		msrs->entry_ctls_high);
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;

	/* cpu-based controls */
	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
		msrs->procbased_ctls_low,
		msrs->procbased_ctls_high);
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->procbased_ctls_high &=
		CPU_BASED_VIRTUAL_INTR_PENDING |
		CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);

	/*
	 * secondary cpu-based controls.
	 * Do not include those that depend on CPUID bits, they are added
	 * later by vmx_cpuid_update.
	 */
	if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);

	msrs->secondary_ctls_low = 0;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_XSAVES;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
		if (cpu_has_vmx_ept_execute_only())
			msrs->ept_caps |=
				VMX_EPT_EXECUTE_ONLY_BIT;
		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}
	}

	if (cpu_has_vmx_vmfunc()) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VMFUNC;
		/*
		 * Advertise EPTP switching unconditionally
		 * since we emulate it
		 */
		if (enable_ept)
			msrs->vmfunc_controls =
				VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context.  The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	/* miscellaneous data */
	rdmsr(MSR_IA32_VMX_MISC,
		msrs->misc_low,
		msrs->misc_high);
	msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT;
	msrs->misc_high = 0;

	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic =
		VMCS12_REVISION |
		VMX_BASIC_TRUE_CTLS |
		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;

	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
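	 * (Bits set in the CR0/CR4_FIXED0 values must be 1 in the guest's
	 * CR0/CR4; bits clear in the FIXED1 values, read from hardware
	 * below, must be 0.)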
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	/* highest index: VMX_PREEMPTION_TIMER_VALUE */
	msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
}

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmoff;
	exit_handlers[EXIT_REASON_VMON]		= handle_vmon;
	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept;
	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc;

	kvm_x86_ops->check_nested_events = vmx_check_nested_events;
	kvm_x86_ops->get_nested_state = vmx_get_nested_state;
	kvm_x86_ops->set_nested_state = vmx_set_nested_state;
	kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
	kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
	kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;

	return 0;
}