1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/highmem.h> 18 #include <linux/hrtimer.h> 19 #include <linux/kernel.h> 20 #include <linux/kvm_host.h> 21 #include <linux/module.h> 22 #include <linux/moduleparam.h> 23 #include <linux/mod_devicetable.h> 24 #include <linux/mm.h> 25 #include <linux/objtool.h> 26 #include <linux/sched.h> 27 #include <linux/sched/smt.h> 28 #include <linux/slab.h> 29 #include <linux/tboot.h> 30 #include <linux/trace_events.h> 31 #include <linux/entry-kvm.h> 32 33 #include <asm/apic.h> 34 #include <asm/asm.h> 35 #include <asm/cpu.h> 36 #include <asm/cpu_device_id.h> 37 #include <asm/debugreg.h> 38 #include <asm/desc.h> 39 #include <asm/fpu/api.h> 40 #include <asm/fpu/xstate.h> 41 #include <asm/idtentry.h> 42 #include <asm/io.h> 43 #include <asm/irq_remapping.h> 44 #include <asm/reboot.h> 45 #include <asm/perf_event.h> 46 #include <asm/mmu_context.h> 47 #include <asm/mshyperv.h> 48 #include <asm/mwait.h> 49 #include <asm/spec-ctrl.h> 50 #include <asm/vmx.h> 51 52 #include "capabilities.h" 53 #include "cpuid.h" 54 #include "hyperv.h" 55 #include "kvm_onhyperv.h" 56 #include "irq.h" 57 #include "kvm_cache_regs.h" 58 #include "lapic.h" 59 #include "mmu.h" 60 #include "nested.h" 61 #include "pmu.h" 62 #include "sgx.h" 63 #include "trace.h" 64 #include "vmcs.h" 65 #include "vmcs12.h" 66 #include "vmx.h" 67 #include "x86.h" 68 #include "smm.h" 69 #include "vmx_onhyperv.h" 70 #include "posted_intr.h" 71 72 MODULE_AUTHOR("Qumranet"); 73 MODULE_LICENSE("GPL"); 74 75 #ifdef MODULE 76 static const struct x86_cpu_id vmx_cpu_id[] = { 77 X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL), 78 {} 79 }; 80 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); 81 #endif 82 83 bool __read_mostly enable_vpid = 1; 84 module_param_named(vpid, enable_vpid, bool, 0444); 85 86 static bool __read_mostly enable_vnmi = 1; 87 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); 88 89 bool __read_mostly flexpriority_enabled = 1; 90 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); 91 92 bool __read_mostly enable_ept = 1; 93 module_param_named(ept, enable_ept, bool, S_IRUGO); 94 95 bool __read_mostly enable_unrestricted_guest = 1; 96 module_param_named(unrestricted_guest, 97 enable_unrestricted_guest, bool, S_IRUGO); 98 99 bool __read_mostly enable_ept_ad_bits = 1; 100 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); 101 102 static bool __read_mostly emulate_invalid_guest_state = true; 103 module_param(emulate_invalid_guest_state, bool, S_IRUGO); 104 105 static bool __read_mostly fasteoi = 1; 106 module_param(fasteoi, bool, S_IRUGO); 107 108 module_param(enable_apicv, bool, S_IRUGO); 109 110 bool __read_mostly enable_ipiv = true; 111 module_param(enable_ipiv, bool, 0444); 112 113 /* 114 * If nested=1, nested virtualization is supported, i.e., guests may use 115 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 116 * use VMX instructions. 
117 */ 118 static bool __read_mostly nested = 1; 119 module_param(nested, bool, S_IRUGO); 120 121 bool __read_mostly enable_pml = 1; 122 module_param_named(pml, enable_pml, bool, S_IRUGO); 123 124 static bool __read_mostly error_on_inconsistent_vmcs_config = true; 125 module_param(error_on_inconsistent_vmcs_config, bool, 0444); 126 127 static bool __read_mostly dump_invalid_vmcs = 0; 128 module_param(dump_invalid_vmcs, bool, 0644); 129 130 #define MSR_BITMAP_MODE_X2APIC 1 131 #define MSR_BITMAP_MODE_X2APIC_APICV 2 132 133 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL 134 135 /* Guest_tsc -> host_tsc conversion requires 64-bit division. */ 136 static int __read_mostly cpu_preemption_timer_multi; 137 static bool __read_mostly enable_preemption_timer = 1; 138 #ifdef CONFIG_X86_64 139 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); 140 #endif 141 142 extern bool __read_mostly allow_smaller_maxphyaddr; 143 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); 144 145 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) 146 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE 147 #define KVM_VM_CR0_ALWAYS_ON \ 148 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) 149 150 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE 151 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 152 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) 153 154 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) 155 156 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ 157 RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ 158 RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ 159 RTIT_STATUS_BYTECNT)) 160 161 /* 162 * List of MSRs that can be directly passed to the guest. 163 * In addition to these x2apic and PT MSRs are handled specially. 164 */ 165 static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = { 166 MSR_IA32_SPEC_CTRL, 167 MSR_IA32_PRED_CMD, 168 MSR_IA32_FLUSH_CMD, 169 MSR_IA32_TSC, 170 #ifdef CONFIG_X86_64 171 MSR_FS_BASE, 172 MSR_GS_BASE, 173 MSR_KERNEL_GS_BASE, 174 MSR_IA32_XFD, 175 MSR_IA32_XFD_ERR, 176 #endif 177 MSR_IA32_SYSENTER_CS, 178 MSR_IA32_SYSENTER_ESP, 179 MSR_IA32_SYSENTER_EIP, 180 MSR_CORE_C1_RES, 181 MSR_CORE_C3_RESIDENCY, 182 MSR_CORE_C6_RESIDENCY, 183 MSR_CORE_C7_RESIDENCY, 184 }; 185 186 /* 187 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 188 * ple_gap: upper bound on the amount of time between two successive 189 * executions of PAUSE in a loop. Also indicate if ple enabled. 190 * According to test, this time is usually smaller than 128 cycles. 191 * ple_window: upper bound on the amount of time a guest is allowed to execute 192 * in a PAUSE loop. Tests indicate that most spinlocks are held for 193 * less than 2^12 cycles 194 * Time is measured based on a counter that runs at the same rate as the TSC, 195 * refer SDM volume 3b section 21.6.13 & 22.1.3. 196 */ 197 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; 198 module_param(ple_gap, uint, 0444); 199 200 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 201 module_param(ple_window, uint, 0444); 202 203 /* Default doubles per-vcpu window every exit. */ 204 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; 205 module_param(ple_window_grow, uint, 0444); 206 207 /* Default resets per-vcpu window every exit to ple_window. 
*/ 208 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; 209 module_param(ple_window_shrink, uint, 0444); 210 211 /* Default is to compute the maximum so we can never overflow. */ 212 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 213 module_param(ple_window_max, uint, 0444); 214 215 /* Default is SYSTEM mode, 1 for host-guest mode */ 216 int __read_mostly pt_mode = PT_MODE_SYSTEM; 217 module_param(pt_mode, int, S_IRUGO); 218 219 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); 220 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); 221 static DEFINE_MUTEX(vmx_l1d_flush_mutex); 222 223 /* Storage for pre module init parameter parsing */ 224 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; 225 226 static const struct { 227 const char *option; 228 bool for_parse; 229 } vmentry_l1d_param[] = { 230 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, 231 [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, 232 [VMENTER_L1D_FLUSH_COND] = {"cond", true}, 233 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, 234 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, 235 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, 236 }; 237 238 #define L1D_CACHE_ORDER 4 239 static void *vmx_l1d_flush_pages; 240 241 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 242 { 243 struct page *page; 244 unsigned int i; 245 246 if (!boot_cpu_has_bug(X86_BUG_L1TF)) { 247 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 248 return 0; 249 } 250 251 if (!enable_ept) { 252 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; 253 return 0; 254 } 255 256 if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { 257 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 258 return 0; 259 } 260 261 /* If set to auto use the default l1tf mitigation method */ 262 if (l1tf == VMENTER_L1D_FLUSH_AUTO) { 263 switch (l1tf_mitigation) { 264 case L1TF_MITIGATION_OFF: 265 l1tf = VMENTER_L1D_FLUSH_NEVER; 266 break; 267 case L1TF_MITIGATION_FLUSH_NOWARN: 268 case L1TF_MITIGATION_FLUSH: 269 case L1TF_MITIGATION_FLUSH_NOSMT: 270 l1tf = VMENTER_L1D_FLUSH_COND; 271 break; 272 case L1TF_MITIGATION_FULL: 273 case L1TF_MITIGATION_FULL_FORCE: 274 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 275 break; 276 } 277 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { 278 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 279 } 280 281 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && 282 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { 283 /* 284 * This allocation for vmx_l1d_flush_pages is not tied to a VM 285 * lifetime and so should not be charged to a memcg. 286 */ 287 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); 288 if (!page) 289 return -ENOMEM; 290 vmx_l1d_flush_pages = page_address(page); 291 292 /* 293 * Initialize each page with a different pattern in 294 * order to protect against KSM in the nested 295 * virtualization case. 
296 */ 297 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { 298 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, 299 PAGE_SIZE); 300 } 301 } 302 303 l1tf_vmx_mitigation = l1tf; 304 305 if (l1tf != VMENTER_L1D_FLUSH_NEVER) 306 static_branch_enable(&vmx_l1d_should_flush); 307 else 308 static_branch_disable(&vmx_l1d_should_flush); 309 310 if (l1tf == VMENTER_L1D_FLUSH_COND) 311 static_branch_enable(&vmx_l1d_flush_cond); 312 else 313 static_branch_disable(&vmx_l1d_flush_cond); 314 return 0; 315 } 316 317 static int vmentry_l1d_flush_parse(const char *s) 318 { 319 unsigned int i; 320 321 if (s) { 322 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { 323 if (vmentry_l1d_param[i].for_parse && 324 sysfs_streq(s, vmentry_l1d_param[i].option)) 325 return i; 326 } 327 } 328 return -EINVAL; 329 } 330 331 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 332 { 333 int l1tf, ret; 334 335 l1tf = vmentry_l1d_flush_parse(s); 336 if (l1tf < 0) 337 return l1tf; 338 339 if (!boot_cpu_has(X86_BUG_L1TF)) 340 return 0; 341 342 /* 343 * Has vmx_init() run already? If not then this is the pre init 344 * parameter parsing. In that case just store the value and let 345 * vmx_init() do the proper setup after enable_ept has been 346 * established. 347 */ 348 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { 349 vmentry_l1d_flush_param = l1tf; 350 return 0; 351 } 352 353 mutex_lock(&vmx_l1d_flush_mutex); 354 ret = vmx_setup_l1d_flush(l1tf); 355 mutex_unlock(&vmx_l1d_flush_mutex); 356 return ret; 357 } 358 359 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 360 { 361 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) 362 return sysfs_emit(s, "???\n"); 363 364 return sysfs_emit(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); 365 } 366 367 static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) 368 { 369 u64 msr; 370 371 if (!vmx->disable_fb_clear) 372 return; 373 374 msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); 375 msr |= FB_CLEAR_DIS; 376 native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); 377 /* Cache the MSR value to avoid reading it later */ 378 vmx->msr_ia32_mcu_opt_ctrl = msr; 379 } 380 381 static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) 382 { 383 if (!vmx->disable_fb_clear) 384 return; 385 386 vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; 387 native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); 388 } 389 390 static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) 391 { 392 /* 393 * Disable VERW's behavior of clearing CPU buffers for the guest if the 394 * CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled 395 * the mitigation. Disabling the clearing behavior provides a 396 * performance boost for guests that aren't aware that manually clearing 397 * CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry 398 * and VM-Exit. 399 */ 400 vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) && 401 (host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) && 402 !boot_cpu_has_bug(X86_BUG_MDS) && 403 !boot_cpu_has_bug(X86_BUG_TAA); 404 405 /* 406 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS 407 * at VMEntry. Skip the MSR read/write when a guest has no use case to 408 * execute VERW. 
409 */ 410 if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) || 411 ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) && 412 (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) && 413 (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) && 414 (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) && 415 (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO))) 416 vmx->disable_fb_clear = false; 417 } 418 419 static const struct kernel_param_ops vmentry_l1d_flush_ops = { 420 .set = vmentry_l1d_flush_set, 421 .get = vmentry_l1d_flush_get, 422 }; 423 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 424 425 static u32 vmx_segment_access_rights(struct kvm_segment *var); 426 427 void vmx_vmexit(void); 428 429 #define vmx_insn_failed(fmt...) \ 430 do { \ 431 WARN_ONCE(1, fmt); \ 432 pr_warn_ratelimited(fmt); \ 433 } while (0) 434 435 noinline void vmread_error(unsigned long field) 436 { 437 vmx_insn_failed("vmread failed: field=%lx\n", field); 438 } 439 440 #ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT 441 noinstr void vmread_error_trampoline2(unsigned long field, bool fault) 442 { 443 if (fault) { 444 kvm_spurious_fault(); 445 } else { 446 instrumentation_begin(); 447 vmread_error(field); 448 instrumentation_end(); 449 } 450 } 451 #endif 452 453 noinline void vmwrite_error(unsigned long field, unsigned long value) 454 { 455 vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n", 456 field, value, vmcs_read32(VM_INSTRUCTION_ERROR)); 457 } 458 459 noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr) 460 { 461 vmx_insn_failed("vmclear failed: %p/%llx err=%u\n", 462 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 463 } 464 465 noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr) 466 { 467 vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n", 468 vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR)); 469 } 470 471 noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva) 472 { 473 vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n", 474 ext, vpid, gva); 475 } 476 477 noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa) 478 { 479 vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n", 480 ext, eptp, gpa); 481 } 482 483 static DEFINE_PER_CPU(struct vmcs *, vmxarea); 484 DEFINE_PER_CPU(struct vmcs *, current_vmcs); 485 /* 486 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed 487 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 
488 */ 489 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 490 491 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 492 static DEFINE_SPINLOCK(vmx_vpid_lock); 493 494 struct vmcs_config vmcs_config __ro_after_init; 495 struct vmx_capability vmx_capability __ro_after_init; 496 497 #define VMX_SEGMENT_FIELD(seg) \ 498 [VCPU_SREG_##seg] = { \ 499 .selector = GUEST_##seg##_SELECTOR, \ 500 .base = GUEST_##seg##_BASE, \ 501 .limit = GUEST_##seg##_LIMIT, \ 502 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 503 } 504 505 static const struct kvm_vmx_segment_field { 506 unsigned selector; 507 unsigned base; 508 unsigned limit; 509 unsigned ar_bytes; 510 } kvm_vmx_segment_fields[] = { 511 VMX_SEGMENT_FIELD(CS), 512 VMX_SEGMENT_FIELD(DS), 513 VMX_SEGMENT_FIELD(ES), 514 VMX_SEGMENT_FIELD(FS), 515 VMX_SEGMENT_FIELD(GS), 516 VMX_SEGMENT_FIELD(SS), 517 VMX_SEGMENT_FIELD(TR), 518 VMX_SEGMENT_FIELD(LDTR), 519 }; 520 521 static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) 522 { 523 vmx->segment_cache.bitmask = 0; 524 } 525 526 static unsigned long host_idt_base; 527 528 #if IS_ENABLED(CONFIG_HYPERV) 529 static struct kvm_x86_ops vmx_x86_ops __initdata; 530 531 static bool __read_mostly enlightened_vmcs = true; 532 module_param(enlightened_vmcs, bool, 0444); 533 534 static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu) 535 { 536 struct hv_enlightened_vmcs *evmcs; 537 struct hv_partition_assist_pg **p_hv_pa_pg = 538 &to_kvm_hv(vcpu->kvm)->hv_pa_pg; 539 /* 540 * Synthetic VM-Exit is not enabled in current code and so All 541 * evmcs in singe VM shares same assist page. 542 */ 543 if (!*p_hv_pa_pg) 544 *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); 545 546 if (!*p_hv_pa_pg) 547 return -ENOMEM; 548 549 evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs; 550 551 evmcs->partition_assist_page = 552 __pa(*p_hv_pa_pg); 553 evmcs->hv_vm_id = (unsigned long)vcpu->kvm; 554 evmcs->hv_enlightenments_control.nested_flush_hypercall = 1; 555 556 return 0; 557 } 558 559 static __init void hv_init_evmcs(void) 560 { 561 int cpu; 562 563 if (!enlightened_vmcs) 564 return; 565 566 /* 567 * Enlightened VMCS usage should be recommended and the host needs 568 * to support eVMCS v1 or above. 569 */ 570 if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && 571 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= 572 KVM_EVMCS_VERSION) { 573 574 /* Check that we have assist pages on all online CPUs */ 575 for_each_online_cpu(cpu) { 576 if (!hv_get_vp_assist_page(cpu)) { 577 enlightened_vmcs = false; 578 break; 579 } 580 } 581 582 if (enlightened_vmcs) { 583 pr_info("Using Hyper-V Enlightened VMCS\n"); 584 static_branch_enable(&__kvm_is_using_evmcs); 585 } 586 587 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) 588 vmx_x86_ops.enable_l2_tlb_flush 589 = hv_enable_l2_tlb_flush; 590 591 } else { 592 enlightened_vmcs = false; 593 } 594 } 595 596 static void hv_reset_evmcs(void) 597 { 598 struct hv_vp_assist_page *vp_ap; 599 600 if (!kvm_is_using_evmcs()) 601 return; 602 603 /* 604 * KVM should enable eVMCS if and only if all CPUs have a VP assist 605 * page, and should reject CPU onlining if eVMCS is enabled the CPU 606 * doesn't have a VP assist page allocated. 607 */ 608 vp_ap = hv_get_vp_assist_page(smp_processor_id()); 609 if (WARN_ON_ONCE(!vp_ap)) 610 return; 611 612 /* 613 * Reset everything to support using non-enlightened VMCS access later 614 * (e.g. 
when we reload the module with enlightened_vmcs=0) 615 */ 616 vp_ap->nested_control.features.directhypercall = 0; 617 vp_ap->current_nested_vmcs = 0; 618 vp_ap->enlighten_vmentry = 0; 619 } 620 621 #else /* IS_ENABLED(CONFIG_HYPERV) */ 622 static void hv_init_evmcs(void) {} 623 static void hv_reset_evmcs(void) {} 624 #endif /* IS_ENABLED(CONFIG_HYPERV) */ 625 626 /* 627 * Comment's format: document - errata name - stepping - processor name. 628 * Refer from 629 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp 630 */ 631 static u32 vmx_preemption_cpu_tfms[] = { 632 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ 633 0x000206E6, 634 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ 635 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ 636 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ 637 0x00020652, 638 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ 639 0x00020655, 640 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ 641 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ 642 /* 643 * 320767.pdf - AAP86 - B1 - 644 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile 645 */ 646 0x000106E5, 647 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ 648 0x000106A0, 649 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ 650 0x000106A1, 651 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ 652 0x000106A4, 653 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ 654 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ 655 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ 656 0x000106A5, 657 /* Xeon E3-1220 V2 */ 658 0x000306A8, 659 }; 660 661 static inline bool cpu_has_broken_vmx_preemption_timer(void) 662 { 663 u32 eax = cpuid_eax(0x00000001), i; 664 665 /* Clear the reserved bits */ 666 eax &= ~(0x3U << 14 | 0xfU << 28); 667 for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) 668 if (eax == vmx_preemption_cpu_tfms[i]) 669 return true; 670 671 return false; 672 } 673 674 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) 675 { 676 return flexpriority_enabled && lapic_in_kernel(vcpu); 677 } 678 679 static int possible_passthrough_msr_slot(u32 msr) 680 { 681 u32 i; 682 683 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) 684 if (vmx_possible_passthrough_msrs[i] == msr) 685 return i; 686 687 return -ENOENT; 688 } 689 690 static bool is_valid_passthrough_msr(u32 msr) 691 { 692 bool r; 693 694 switch (msr) { 695 case 0x800 ... 0x8ff: 696 /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */ 697 return true; 698 case MSR_IA32_RTIT_STATUS: 699 case MSR_IA32_RTIT_OUTPUT_BASE: 700 case MSR_IA32_RTIT_OUTPUT_MASK: 701 case MSR_IA32_RTIT_CR3_MATCH: 702 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 703 /* PT MSRs. These are handled in pt_update_intercept_for_msr() */ 704 case MSR_LBR_SELECT: 705 case MSR_LBR_TOS: 706 case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31: 707 case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31: 708 case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31: 709 case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8: 710 case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8: 711 /* LBR MSRs. 
These are handled in vmx_update_intercept_for_lbr_msrs() */ 712 return true; 713 } 714 715 r = possible_passthrough_msr_slot(msr) != -ENOENT; 716 717 WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr); 718 719 return r; 720 } 721 722 struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr) 723 { 724 int i; 725 726 i = kvm_find_user_return_msr(msr); 727 if (i >= 0) 728 return &vmx->guest_uret_msrs[i]; 729 return NULL; 730 } 731 732 static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx, 733 struct vmx_uret_msr *msr, u64 data) 734 { 735 unsigned int slot = msr - vmx->guest_uret_msrs; 736 int ret = 0; 737 738 if (msr->load_into_hardware) { 739 preempt_disable(); 740 ret = kvm_set_user_return_msr(slot, data, msr->mask); 741 preempt_enable(); 742 } 743 if (!ret) 744 msr->data = data; 745 return ret; 746 } 747 748 /* 749 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults) 750 * 751 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to 752 * atomically track post-VMXON state, e.g. this may be called in NMI context. 753 * Eat all faults as all other faults on VMXOFF faults are mode related, i.e. 754 * faults are guaranteed to be due to the !post-VMXON check unless the CPU is 755 * magically in RM, VM86, compat mode, or at CPL>0. 756 */ 757 static int kvm_cpu_vmxoff(void) 758 { 759 asm goto("1: vmxoff\n\t" 760 _ASM_EXTABLE(1b, %l[fault]) 761 ::: "cc", "memory" : fault); 762 763 cr4_clear_bits(X86_CR4_VMXE); 764 return 0; 765 766 fault: 767 cr4_clear_bits(X86_CR4_VMXE); 768 return -EIO; 769 } 770 771 static void vmx_emergency_disable(void) 772 { 773 int cpu = raw_smp_processor_id(); 774 struct loaded_vmcs *v; 775 776 kvm_rebooting = true; 777 778 /* 779 * Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be 780 * set in task context. If this races with VMX is disabled by an NMI, 781 * VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to 782 * kvm_rebooting set. 783 */ 784 if (!(__read_cr4() & X86_CR4_VMXE)) 785 return; 786 787 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 788 loaded_vmcss_on_cpu_link) 789 vmcs_clear(v->vmcs); 790 791 kvm_cpu_vmxoff(); 792 } 793 794 static void __loaded_vmcs_clear(void *arg) 795 { 796 struct loaded_vmcs *loaded_vmcs = arg; 797 int cpu = raw_smp_processor_id(); 798 799 if (loaded_vmcs->cpu != cpu) 800 return; /* vcpu migration can race with cpu offline */ 801 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 802 per_cpu(current_vmcs, cpu) = NULL; 803 804 vmcs_clear(loaded_vmcs->vmcs); 805 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) 806 vmcs_clear(loaded_vmcs->shadow_vmcs); 807 808 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 809 810 /* 811 * Ensure all writes to loaded_vmcs, including deleting it from its 812 * current percpu list, complete before setting loaded_vmcs->cpu to 813 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first 814 * and add loaded_vmcs to its percpu list before it's deleted from this 815 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). 
816 */ 817 smp_wmb(); 818 819 loaded_vmcs->cpu = -1; 820 loaded_vmcs->launched = 0; 821 } 822 823 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 824 { 825 int cpu = loaded_vmcs->cpu; 826 827 if (cpu != -1) 828 smp_call_function_single(cpu, 829 __loaded_vmcs_clear, loaded_vmcs, 1); 830 } 831 832 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, 833 unsigned field) 834 { 835 bool ret; 836 u32 mask = 1 << (seg * SEG_FIELD_NR + field); 837 838 if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { 839 kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); 840 vmx->segment_cache.bitmask = 0; 841 } 842 ret = vmx->segment_cache.bitmask & mask; 843 vmx->segment_cache.bitmask |= mask; 844 return ret; 845 } 846 847 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) 848 { 849 u16 *p = &vmx->segment_cache.seg[seg].selector; 850 851 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) 852 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); 853 return *p; 854 } 855 856 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) 857 { 858 ulong *p = &vmx->segment_cache.seg[seg].base; 859 860 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) 861 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); 862 return *p; 863 } 864 865 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) 866 { 867 u32 *p = &vmx->segment_cache.seg[seg].limit; 868 869 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) 870 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); 871 return *p; 872 } 873 874 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) 875 { 876 u32 *p = &vmx->segment_cache.seg[seg].ar; 877 878 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) 879 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); 880 return *p; 881 } 882 883 void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) 884 { 885 u32 eb; 886 887 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | 888 (1u << DB_VECTOR) | (1u << AC_VECTOR); 889 /* 890 * Guest access to VMware backdoor ports could legitimately 891 * trigger #GP because of TSS I/O permission bitmap. 892 * We intercept those #GP and allow access to them anyway 893 * as VMware does. 894 */ 895 if (enable_vmware_backdoor) 896 eb |= (1u << GP_VECTOR); 897 if ((vcpu->guest_debug & 898 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 899 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 900 eb |= 1u << BP_VECTOR; 901 if (to_vmx(vcpu)->rmode.vm86_active) 902 eb = ~0; 903 if (!vmx_need_pf_intercept(vcpu)) 904 eb &= ~(1u << PF_VECTOR); 905 906 /* When we are running a nested L2 guest and L1 specified for it a 907 * certain exception bitmap, we must trap the same exceptions and pass 908 * them to L1. When running L2, we will only handle the exceptions 909 * specified above if L1 did not want them. 910 */ 911 if (is_guest_mode(vcpu)) 912 eb |= get_vmcs12(vcpu)->exception_bitmap; 913 else { 914 int mask = 0, match = 0; 915 916 if (enable_ept && (eb & (1u << PF_VECTOR))) { 917 /* 918 * If EPT is enabled, #PF is currently only intercepted 919 * if MAXPHYADDR is smaller on the guest than on the 920 * host. In that case we only care about present, 921 * non-reserved faults. For vmcs02, however, PFEC_MASK 922 * and PFEC_MATCH are set in prepare_vmcs02_rare. 
923 */ 924 mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; 925 match = PFERR_PRESENT_MASK; 926 } 927 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask); 928 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match); 929 } 930 931 /* 932 * Disabling xfd interception indicates that dynamic xfeatures 933 * might be used in the guest. Always trap #NM in this case 934 * to save guest xfd_err timely. 935 */ 936 if (vcpu->arch.xfd_no_write_intercept) 937 eb |= (1u << NM_VECTOR); 938 939 vmcs_write32(EXCEPTION_BITMAP, eb); 940 } 941 942 /* 943 * Check if MSR is intercepted for currently loaded MSR bitmap. 944 */ 945 static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) 946 { 947 if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) 948 return true; 949 950 return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); 951 } 952 953 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) 954 { 955 unsigned int flags = 0; 956 957 if (vmx->loaded_vmcs->launched) 958 flags |= VMX_RUN_VMRESUME; 959 960 /* 961 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free 962 * to change it directly without causing a vmexit. In that case read 963 * it after vmexit and store it in vmx->spec_ctrl. 964 */ 965 if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)) 966 flags |= VMX_RUN_SAVE_SPEC_CTRL; 967 968 return flags; 969 } 970 971 static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, 972 unsigned long entry, unsigned long exit) 973 { 974 vm_entry_controls_clearbit(vmx, entry); 975 vm_exit_controls_clearbit(vmx, exit); 976 } 977 978 int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) 979 { 980 unsigned int i; 981 982 for (i = 0; i < m->nr; ++i) { 983 if (m->val[i].index == msr) 984 return i; 985 } 986 return -ENOENT; 987 } 988 989 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 990 { 991 int i; 992 struct msr_autoload *m = &vmx->msr_autoload; 993 994 switch (msr) { 995 case MSR_EFER: 996 if (cpu_has_load_ia32_efer()) { 997 clear_atomic_switch_msr_special(vmx, 998 VM_ENTRY_LOAD_IA32_EFER, 999 VM_EXIT_LOAD_IA32_EFER); 1000 return; 1001 } 1002 break; 1003 case MSR_CORE_PERF_GLOBAL_CTRL: 1004 if (cpu_has_load_perf_global_ctrl()) { 1005 clear_atomic_switch_msr_special(vmx, 1006 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1007 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 1008 return; 1009 } 1010 break; 1011 } 1012 i = vmx_find_loadstore_msr_slot(&m->guest, msr); 1013 if (i < 0) 1014 goto skip_guest; 1015 --m->guest.nr; 1016 m->guest.val[i] = m->guest.val[m->guest.nr]; 1017 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1018 1019 skip_guest: 1020 i = vmx_find_loadstore_msr_slot(&m->host, msr); 1021 if (i < 0) 1022 return; 1023 1024 --m->host.nr; 1025 m->host.val[i] = m->host.val[m->host.nr]; 1026 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1027 } 1028 1029 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, 1030 unsigned long entry, unsigned long exit, 1031 unsigned long guest_val_vmcs, unsigned long host_val_vmcs, 1032 u64 guest_val, u64 host_val) 1033 { 1034 vmcs_write64(guest_val_vmcs, guest_val); 1035 if (host_val_vmcs != HOST_IA32_EFER) 1036 vmcs_write64(host_val_vmcs, host_val); 1037 vm_entry_controls_setbit(vmx, entry); 1038 vm_exit_controls_setbit(vmx, exit); 1039 } 1040 1041 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 1042 u64 guest_val, u64 host_val, bool entry_only) 1043 { 1044 int i, j = 0; 1045 struct msr_autoload *m = &vmx->msr_autoload; 1046 1047 switch (msr) { 1048 case 
MSR_EFER: 1049 if (cpu_has_load_ia32_efer()) { 1050 add_atomic_switch_msr_special(vmx, 1051 VM_ENTRY_LOAD_IA32_EFER, 1052 VM_EXIT_LOAD_IA32_EFER, 1053 GUEST_IA32_EFER, 1054 HOST_IA32_EFER, 1055 guest_val, host_val); 1056 return; 1057 } 1058 break; 1059 case MSR_CORE_PERF_GLOBAL_CTRL: 1060 if (cpu_has_load_perf_global_ctrl()) { 1061 add_atomic_switch_msr_special(vmx, 1062 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 1063 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, 1064 GUEST_IA32_PERF_GLOBAL_CTRL, 1065 HOST_IA32_PERF_GLOBAL_CTRL, 1066 guest_val, host_val); 1067 return; 1068 } 1069 break; 1070 case MSR_IA32_PEBS_ENABLE: 1071 /* PEBS needs a quiescent period after being disabled (to write 1072 * a record). Disabling PEBS through VMX MSR swapping doesn't 1073 * provide that period, so a CPU could write host's record into 1074 * guest's memory. 1075 */ 1076 wrmsrl(MSR_IA32_PEBS_ENABLE, 0); 1077 } 1078 1079 i = vmx_find_loadstore_msr_slot(&m->guest, msr); 1080 if (!entry_only) 1081 j = vmx_find_loadstore_msr_slot(&m->host, msr); 1082 1083 if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || 1084 (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { 1085 printk_once(KERN_WARNING "Not enough msr switch entries. " 1086 "Can't add msr %x\n", msr); 1087 return; 1088 } 1089 if (i < 0) { 1090 i = m->guest.nr++; 1091 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1092 } 1093 m->guest.val[i].index = msr; 1094 m->guest.val[i].value = guest_val; 1095 1096 if (entry_only) 1097 return; 1098 1099 if (j < 0) { 1100 j = m->host.nr++; 1101 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1102 } 1103 m->host.val[j].index = msr; 1104 m->host.val[j].value = host_val; 1105 } 1106 1107 static bool update_transition_efer(struct vcpu_vmx *vmx) 1108 { 1109 u64 guest_efer = vmx->vcpu.arch.efer; 1110 u64 ignore_bits = 0; 1111 int i; 1112 1113 /* Shadow paging assumes NX to be available. */ 1114 if (!enable_ept) 1115 guest_efer |= EFER_NX; 1116 1117 /* 1118 * LMA and LME handled by hardware; SCE meaningless outside long mode. 1119 */ 1120 ignore_bits |= EFER_SCE; 1121 #ifdef CONFIG_X86_64 1122 ignore_bits |= EFER_LMA | EFER_LME; 1123 /* SCE is meaningful only in long mode on Intel */ 1124 if (guest_efer & EFER_LMA) 1125 ignore_bits &= ~(u64)EFER_SCE; 1126 #endif 1127 1128 /* 1129 * On EPT, we can't emulate NX, so we must switch EFER atomically. 1130 * On CPUs that support "load IA32_EFER", always switch EFER 1131 * atomically, since it's faster than switching it manually. 1132 */ 1133 if (cpu_has_load_ia32_efer() || 1134 (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { 1135 if (!(guest_efer & EFER_LMA)) 1136 guest_efer &= ~EFER_LME; 1137 if (guest_efer != host_efer) 1138 add_atomic_switch_msr(vmx, MSR_EFER, 1139 guest_efer, host_efer, false); 1140 else 1141 clear_atomic_switch_msr(vmx, MSR_EFER); 1142 return false; 1143 } 1144 1145 i = kvm_find_user_return_msr(MSR_EFER); 1146 if (i < 0) 1147 return false; 1148 1149 clear_atomic_switch_msr(vmx, MSR_EFER); 1150 1151 guest_efer &= ~ignore_bits; 1152 guest_efer |= host_efer & ignore_bits; 1153 1154 vmx->guest_uret_msrs[i].data = guest_efer; 1155 vmx->guest_uret_msrs[i].mask = ~ignore_bits; 1156 1157 return true; 1158 } 1159 1160 #ifdef CONFIG_X86_32 1161 /* 1162 * On 32-bit kernels, VM exits still load the FS and GS bases from the 1163 * VMCS rather than the segment table. KVM uses this helper to figure 1164 * out the current bases to poke them into the VMCS before entry. 
1165 */ 1166 static unsigned long segment_base(u16 selector) 1167 { 1168 struct desc_struct *table; 1169 unsigned long v; 1170 1171 if (!(selector & ~SEGMENT_RPL_MASK)) 1172 return 0; 1173 1174 table = get_current_gdt_ro(); 1175 1176 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { 1177 u16 ldt_selector = kvm_read_ldt(); 1178 1179 if (!(ldt_selector & ~SEGMENT_RPL_MASK)) 1180 return 0; 1181 1182 table = (struct desc_struct *)segment_base(ldt_selector); 1183 } 1184 v = get_desc_base(&table[selector >> 3]); 1185 return v; 1186 } 1187 #endif 1188 1189 static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) 1190 { 1191 return vmx_pt_mode_is_host_guest() && 1192 !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 1193 } 1194 1195 static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base) 1196 { 1197 /* The base must be 128-byte aligned and a legal physical address. */ 1198 return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128); 1199 } 1200 1201 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) 1202 { 1203 u32 i; 1204 1205 wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); 1206 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1207 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1208 wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1209 for (i = 0; i < addr_range; i++) { 1210 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1211 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1212 } 1213 } 1214 1215 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) 1216 { 1217 u32 i; 1218 1219 rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); 1220 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 1221 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1222 rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1223 for (i = 0; i < addr_range; i++) { 1224 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1225 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1226 } 1227 } 1228 1229 static void pt_guest_enter(struct vcpu_vmx *vmx) 1230 { 1231 if (vmx_pt_mode_is_system()) 1232 return; 1233 1234 /* 1235 * GUEST_IA32_RTIT_CTL is already set in the VMCS. 1236 * Save host state before VM entry. 1237 */ 1238 rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1239 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1240 wrmsrl(MSR_IA32_RTIT_CTL, 0); 1241 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1242 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1243 } 1244 } 1245 1246 static void pt_guest_exit(struct vcpu_vmx *vmx) 1247 { 1248 if (vmx_pt_mode_is_system()) 1249 return; 1250 1251 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1252 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges); 1253 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges); 1254 } 1255 1256 /* 1257 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest, 1258 * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary. 
1259 */ 1260 if (vmx->pt_desc.host.ctl) 1261 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1262 } 1263 1264 void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel, 1265 unsigned long fs_base, unsigned long gs_base) 1266 { 1267 if (unlikely(fs_sel != host->fs_sel)) { 1268 if (!(fs_sel & 7)) 1269 vmcs_write16(HOST_FS_SELECTOR, fs_sel); 1270 else 1271 vmcs_write16(HOST_FS_SELECTOR, 0); 1272 host->fs_sel = fs_sel; 1273 } 1274 if (unlikely(gs_sel != host->gs_sel)) { 1275 if (!(gs_sel & 7)) 1276 vmcs_write16(HOST_GS_SELECTOR, gs_sel); 1277 else 1278 vmcs_write16(HOST_GS_SELECTOR, 0); 1279 host->gs_sel = gs_sel; 1280 } 1281 if (unlikely(fs_base != host->fs_base)) { 1282 vmcs_writel(HOST_FS_BASE, fs_base); 1283 host->fs_base = fs_base; 1284 } 1285 if (unlikely(gs_base != host->gs_base)) { 1286 vmcs_writel(HOST_GS_BASE, gs_base); 1287 host->gs_base = gs_base; 1288 } 1289 } 1290 1291 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1292 { 1293 struct vcpu_vmx *vmx = to_vmx(vcpu); 1294 struct vmcs_host_state *host_state; 1295 #ifdef CONFIG_X86_64 1296 int cpu = raw_smp_processor_id(); 1297 #endif 1298 unsigned long fs_base, gs_base; 1299 u16 fs_sel, gs_sel; 1300 int i; 1301 1302 vmx->req_immediate_exit = false; 1303 1304 /* 1305 * Note that guest MSRs to be saved/restored can also be changed 1306 * when guest state is loaded. This happens when guest transitions 1307 * to/from long-mode by setting MSR_EFER.LMA. 1308 */ 1309 if (!vmx->guest_uret_msrs_loaded) { 1310 vmx->guest_uret_msrs_loaded = true; 1311 for (i = 0; i < kvm_nr_uret_msrs; ++i) { 1312 if (!vmx->guest_uret_msrs[i].load_into_hardware) 1313 continue; 1314 1315 kvm_set_user_return_msr(i, 1316 vmx->guest_uret_msrs[i].data, 1317 vmx->guest_uret_msrs[i].mask); 1318 } 1319 } 1320 1321 if (vmx->nested.need_vmcs12_to_shadow_sync) 1322 nested_sync_vmcs12_to_shadow(vcpu); 1323 1324 if (vmx->guest_state_loaded) 1325 return; 1326 1327 host_state = &vmx->loaded_vmcs->host_state; 1328 1329 /* 1330 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 1331 * allow segment selectors with cpl > 0 or ti == 1. 
1332 */ 1333 host_state->ldt_sel = kvm_read_ldt(); 1334 1335 #ifdef CONFIG_X86_64 1336 savesegment(ds, host_state->ds_sel); 1337 savesegment(es, host_state->es_sel); 1338 1339 gs_base = cpu_kernelmode_gs_base(cpu); 1340 if (likely(is_64bit_mm(current->mm))) { 1341 current_save_fsgs(); 1342 fs_sel = current->thread.fsindex; 1343 gs_sel = current->thread.gsindex; 1344 fs_base = current->thread.fsbase; 1345 vmx->msr_host_kernel_gs_base = current->thread.gsbase; 1346 } else { 1347 savesegment(fs, fs_sel); 1348 savesegment(gs, gs_sel); 1349 fs_base = read_msr(MSR_FS_BASE); 1350 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 1351 } 1352 1353 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1354 #else 1355 savesegment(fs, fs_sel); 1356 savesegment(gs, gs_sel); 1357 fs_base = segment_base(fs_sel); 1358 gs_base = segment_base(gs_sel); 1359 #endif 1360 1361 vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base); 1362 vmx->guest_state_loaded = true; 1363 } 1364 1365 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) 1366 { 1367 struct vmcs_host_state *host_state; 1368 1369 if (!vmx->guest_state_loaded) 1370 return; 1371 1372 host_state = &vmx->loaded_vmcs->host_state; 1373 1374 ++vmx->vcpu.stat.host_state_reload; 1375 1376 #ifdef CONFIG_X86_64 1377 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1378 #endif 1379 if (host_state->ldt_sel || (host_state->gs_sel & 7)) { 1380 kvm_load_ldt(host_state->ldt_sel); 1381 #ifdef CONFIG_X86_64 1382 load_gs_index(host_state->gs_sel); 1383 #else 1384 loadsegment(gs, host_state->gs_sel); 1385 #endif 1386 } 1387 if (host_state->fs_sel & 7) 1388 loadsegment(fs, host_state->fs_sel); 1389 #ifdef CONFIG_X86_64 1390 if (unlikely(host_state->ds_sel | host_state->es_sel)) { 1391 loadsegment(ds, host_state->ds_sel); 1392 loadsegment(es, host_state->es_sel); 1393 } 1394 #endif 1395 invalidate_tss_limit(); 1396 #ifdef CONFIG_X86_64 1397 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 1398 #endif 1399 load_fixmap_gdt(raw_smp_processor_id()); 1400 vmx->guest_state_loaded = false; 1401 vmx->guest_uret_msrs_loaded = false; 1402 } 1403 1404 #ifdef CONFIG_X86_64 1405 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) 1406 { 1407 preempt_disable(); 1408 if (vmx->guest_state_loaded) 1409 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1410 preempt_enable(); 1411 return vmx->msr_guest_kernel_gs_base; 1412 } 1413 1414 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) 1415 { 1416 preempt_disable(); 1417 if (vmx->guest_state_loaded) 1418 wrmsrl(MSR_KERNEL_GS_BASE, data); 1419 preempt_enable(); 1420 vmx->msr_guest_kernel_gs_base = data; 1421 } 1422 #endif 1423 1424 void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu, 1425 struct loaded_vmcs *buddy) 1426 { 1427 struct vcpu_vmx *vmx = to_vmx(vcpu); 1428 bool already_loaded = vmx->loaded_vmcs->cpu == cpu; 1429 struct vmcs *prev; 1430 1431 if (!already_loaded) { 1432 loaded_vmcs_clear(vmx->loaded_vmcs); 1433 local_irq_disable(); 1434 1435 /* 1436 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to 1437 * this cpu's percpu list, otherwise it may not yet be deleted 1438 * from its previous cpu's percpu list. Pairs with the 1439 * smb_wmb() in __loaded_vmcs_clear(). 
1440 */ 1441 smp_rmb(); 1442 1443 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1444 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1445 local_irq_enable(); 1446 } 1447 1448 prev = per_cpu(current_vmcs, cpu); 1449 if (prev != vmx->loaded_vmcs->vmcs) { 1450 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; 1451 vmcs_load(vmx->loaded_vmcs->vmcs); 1452 1453 /* 1454 * No indirect branch prediction barrier needed when switching 1455 * the active VMCS within a vCPU, unless IBRS is advertised to 1456 * the vCPU. To minimize the number of IBPBs executed, KVM 1457 * performs IBPB on nested VM-Exit (a single nested transition 1458 * may switch the active VMCS multiple times). 1459 */ 1460 if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev)) 1461 indirect_branch_prediction_barrier(); 1462 } 1463 1464 if (!already_loaded) { 1465 void *gdt = get_current_gdt_ro(); 1466 1467 /* 1468 * Flush all EPTP/VPID contexts, the new pCPU may have stale 1469 * TLB entries from its previous association with the vCPU. 1470 */ 1471 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1472 1473 /* 1474 * Linux uses per-cpu TSS and GDT, so set these when switching 1475 * processors. See 22.2.4. 1476 */ 1477 vmcs_writel(HOST_TR_BASE, 1478 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); 1479 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ 1480 1481 if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) { 1482 /* 22.2.3 */ 1483 vmcs_writel(HOST_IA32_SYSENTER_ESP, 1484 (unsigned long)(cpu_entry_stack(cpu) + 1)); 1485 } 1486 1487 vmx->loaded_vmcs->cpu = cpu; 1488 } 1489 } 1490 1491 /* 1492 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 1493 * vcpu mutex is already taken. 1494 */ 1495 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1496 { 1497 struct vcpu_vmx *vmx = to_vmx(vcpu); 1498 1499 vmx_vcpu_load_vmcs(vcpu, cpu, NULL); 1500 1501 vmx_vcpu_pi_load(vcpu, cpu); 1502 1503 vmx->host_debugctlmsr = get_debugctlmsr(); 1504 } 1505 1506 static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 1507 { 1508 vmx_vcpu_pi_put(vcpu); 1509 1510 vmx_prepare_switch_to_host(to_vmx(vcpu)); 1511 } 1512 1513 bool vmx_emulation_required(struct kvm_vcpu *vcpu) 1514 { 1515 return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu); 1516 } 1517 1518 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1519 { 1520 struct vcpu_vmx *vmx = to_vmx(vcpu); 1521 unsigned long rflags, save_rflags; 1522 1523 if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { 1524 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1525 rflags = vmcs_readl(GUEST_RFLAGS); 1526 if (vmx->rmode.vm86_active) { 1527 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1528 save_rflags = vmx->rmode.save_rflags; 1529 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1530 } 1531 vmx->rflags = rflags; 1532 } 1533 return vmx->rflags; 1534 } 1535 1536 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1537 { 1538 struct vcpu_vmx *vmx = to_vmx(vcpu); 1539 unsigned long old_rflags; 1540 1541 /* 1542 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU 1543 * is an unrestricted guest in order to mark L2 as needing emulation 1544 * if L1 runs L2 as a restricted guest. 
1545 */ 1546 if (is_unrestricted_guest(vcpu)) { 1547 kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); 1548 vmx->rflags = rflags; 1549 vmcs_writel(GUEST_RFLAGS, rflags); 1550 return; 1551 } 1552 1553 old_rflags = vmx_get_rflags(vcpu); 1554 vmx->rflags = rflags; 1555 if (vmx->rmode.vm86_active) { 1556 vmx->rmode.save_rflags = rflags; 1557 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1558 } 1559 vmcs_writel(GUEST_RFLAGS, rflags); 1560 1561 if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) 1562 vmx->emulation_required = vmx_emulation_required(vcpu); 1563 } 1564 1565 static bool vmx_get_if_flag(struct kvm_vcpu *vcpu) 1566 { 1567 return vmx_get_rflags(vcpu) & X86_EFLAGS_IF; 1568 } 1569 1570 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) 1571 { 1572 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1573 int ret = 0; 1574 1575 if (interruptibility & GUEST_INTR_STATE_STI) 1576 ret |= KVM_X86_SHADOW_INT_STI; 1577 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 1578 ret |= KVM_X86_SHADOW_INT_MOV_SS; 1579 1580 return ret; 1581 } 1582 1583 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 1584 { 1585 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1586 u32 interruptibility = interruptibility_old; 1587 1588 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 1589 1590 if (mask & KVM_X86_SHADOW_INT_MOV_SS) 1591 interruptibility |= GUEST_INTR_STATE_MOV_SS; 1592 else if (mask & KVM_X86_SHADOW_INT_STI) 1593 interruptibility |= GUEST_INTR_STATE_STI; 1594 1595 if ((interruptibility != interruptibility_old)) 1596 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); 1597 } 1598 1599 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) 1600 { 1601 struct vcpu_vmx *vmx = to_vmx(vcpu); 1602 unsigned long value; 1603 1604 /* 1605 * Any MSR write that attempts to change bits marked reserved will 1606 * case a #GP fault. 1607 */ 1608 if (data & vmx->pt_desc.ctl_bitmask) 1609 return 1; 1610 1611 /* 1612 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will 1613 * result in a #GP unless the same write also clears TraceEn. 1614 */ 1615 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && 1616 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) 1617 return 1; 1618 1619 /* 1620 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit 1621 * and FabricEn would cause #GP, if 1622 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 1623 */ 1624 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && 1625 !(data & RTIT_CTL_FABRIC_EN) && 1626 !intel_pt_validate_cap(vmx->pt_desc.caps, 1627 PT_CAP_single_range_output)) 1628 return 1; 1629 1630 /* 1631 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that 1632 * utilize encodings marked reserved will cause a #GP fault. 
1633 */ 1634 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); 1635 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && 1636 !test_bit((data & RTIT_CTL_MTC_RANGE) >> 1637 RTIT_CTL_MTC_RANGE_OFFSET, &value)) 1638 return 1; 1639 value = intel_pt_validate_cap(vmx->pt_desc.caps, 1640 PT_CAP_cycle_thresholds); 1641 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1642 !test_bit((data & RTIT_CTL_CYC_THRESH) >> 1643 RTIT_CTL_CYC_THRESH_OFFSET, &value)) 1644 return 1; 1645 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); 1646 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1647 !test_bit((data & RTIT_CTL_PSB_FREQ) >> 1648 RTIT_CTL_PSB_FREQ_OFFSET, &value)) 1649 return 1; 1650 1651 /* 1652 * If ADDRx_CFG is reserved or the encodings is >2 will 1653 * cause a #GP fault. 1654 */ 1655 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; 1656 if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2)) 1657 return 1; 1658 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; 1659 if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2)) 1660 return 1; 1661 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; 1662 if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2)) 1663 return 1; 1664 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; 1665 if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2)) 1666 return 1; 1667 1668 return 0; 1669 } 1670 1671 static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, 1672 void *insn, int insn_len) 1673 { 1674 /* 1675 * Emulation of instructions in SGX enclaves is impossible as RIP does 1676 * not point at the failing instruction, and even if it did, the code 1677 * stream is inaccessible. Inject #UD instead of exiting to userspace 1678 * so that guest userspace can't DoS the guest simply by triggering 1679 * emulation (enclaves are CPL3 only). 1680 */ 1681 if (to_vmx(vcpu)->exit_reason.enclave_mode) { 1682 kvm_queue_exception(vcpu, UD_VECTOR); 1683 return false; 1684 } 1685 return true; 1686 } 1687 1688 static int skip_emulated_instruction(struct kvm_vcpu *vcpu) 1689 { 1690 union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason; 1691 unsigned long rip, orig_rip; 1692 u32 instr_len; 1693 1694 /* 1695 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on 1696 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be 1697 * set when EPT misconfig occurs. In practice, real hardware updates 1698 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors 1699 * (namely Hyper-V) don't set it due to it being undefined behavior, 1700 * i.e. we end up advancing IP with some random value. 1701 */ 1702 if (!static_cpu_has(X86_FEATURE_HYPERVISOR) || 1703 exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) { 1704 instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 1705 1706 /* 1707 * Emulating an enclave's instructions isn't supported as KVM 1708 * cannot access the enclave's memory or its true RIP, e.g. the 1709 * vmcs.GUEST_RIP points at the exit point of the enclave, not 1710 * the RIP that actually triggered the VM-Exit. But, because 1711 * most instructions that cause VM-Exit will #UD in an enclave, 1712 * most instruction-based VM-Exits simply do not occur. 1713 * 1714 * There are a few exceptions, notably the debug instructions 1715 * INT1ICEBRK and INT3, as they are allowed in debug enclaves 1716 * and generate #DB/#BP as expected, which KVM might intercept. 
1717 * But again, the CPU does the dirty work and saves an instr 1718 * length of zero so VMMs don't shoot themselves in the foot. 1719 * WARN if KVM tries to skip a non-zero length instruction on 1720 * a VM-Exit from an enclave. 1721 */ 1722 if (!instr_len) 1723 goto rip_updated; 1724 1725 WARN_ONCE(exit_reason.enclave_mode, 1726 "skipping instruction after SGX enclave VM-Exit"); 1727 1728 orig_rip = kvm_rip_read(vcpu); 1729 rip = orig_rip + instr_len; 1730 #ifdef CONFIG_X86_64 1731 /* 1732 * We need to mask out the high 32 bits of RIP if not in 64-bit 1733 * mode, but just finding out that we are in 64-bit mode is 1734 * quite expensive. Only do it if there was a carry. 1735 */ 1736 if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu)) 1737 rip = (u32)rip; 1738 #endif 1739 kvm_rip_write(vcpu, rip); 1740 } else { 1741 if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP)) 1742 return 0; 1743 } 1744 1745 rip_updated: 1746 /* skipping an emulated instruction also counts */ 1747 vmx_set_interrupt_shadow(vcpu, 0); 1748 1749 return 1; 1750 } 1751 1752 /* 1753 * Recognizes a pending MTF VM-exit and records the nested state for later 1754 * delivery. 1755 */ 1756 static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu) 1757 { 1758 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1759 struct vcpu_vmx *vmx = to_vmx(vcpu); 1760 1761 if (!is_guest_mode(vcpu)) 1762 return; 1763 1764 /* 1765 * Per the SDM, MTF takes priority over debug-trap exceptions besides 1766 * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps 1767 * or ICEBP (in the emulator proper), and skipping of ICEBP after an 1768 * intercepted #DB deliberately avoids single-step #DB and MTF updates 1769 * as ICEBP is higher priority than both. As instruction emulation is 1770 * completed at this point (i.e. KVM is at the instruction boundary), 1771 * any #DB exception pending delivery must be a debug-trap of lower 1772 * priority than MTF. Record the pending MTF state to be delivered in 1773 * vmx_check_nested_events(). 1774 */ 1775 if (nested_cpu_has_mtf(vmcs12) && 1776 (!vcpu->arch.exception.pending || 1777 vcpu->arch.exception.vector == DB_VECTOR) && 1778 (!vcpu->arch.exception_vmexit.pending || 1779 vcpu->arch.exception_vmexit.vector == DB_VECTOR)) { 1780 vmx->nested.mtf_pending = true; 1781 kvm_make_request(KVM_REQ_EVENT, vcpu); 1782 } else { 1783 vmx->nested.mtf_pending = false; 1784 } 1785 } 1786 1787 static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu) 1788 { 1789 vmx_update_emulated_instruction(vcpu); 1790 return skip_emulated_instruction(vcpu); 1791 } 1792 1793 static void vmx_clear_hlt(struct kvm_vcpu *vcpu) 1794 { 1795 /* 1796 * Ensure that we clear the HLT state in the VMCS. We don't need to 1797 * explicitly skip the instruction because if the HLT state is set, 1798 * then the instruction is already executing and RIP has already been 1799 * advanced. 
1800 */ 1801 if (kvm_hlt_in_guest(vcpu->kvm) && 1802 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) 1803 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 1804 } 1805 1806 static void vmx_inject_exception(struct kvm_vcpu *vcpu) 1807 { 1808 struct kvm_queued_exception *ex = &vcpu->arch.exception; 1809 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 1810 struct vcpu_vmx *vmx = to_vmx(vcpu); 1811 1812 kvm_deliver_exception_payload(vcpu, ex); 1813 1814 if (ex->has_error_code) { 1815 /* 1816 * Despite the error code being architecturally defined as 32 1817 * bits, and the VMCS field being 32 bits, Intel CPUs and thus 1818 * VMX don't actually supporting setting bits 31:16. Hardware 1819 * will (should) never provide a bogus error code, but AMD CPUs 1820 * do generate error codes with bits 31:16 set, and so KVM's 1821 * ABI lets userspace shove in arbitrary 32-bit values. Drop 1822 * the upper bits to avoid VM-Fail, losing information that 1823 * does't really exist is preferable to killing the VM. 1824 */ 1825 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code); 1826 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 1827 } 1828 1829 if (vmx->rmode.vm86_active) { 1830 int inc_eip = 0; 1831 if (kvm_exception_is_soft(ex->vector)) 1832 inc_eip = vcpu->arch.event_exit_inst_len; 1833 kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip); 1834 return; 1835 } 1836 1837 WARN_ON_ONCE(vmx->emulation_required); 1838 1839 if (kvm_exception_is_soft(ex->vector)) { 1840 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1841 vmx->vcpu.arch.event_exit_inst_len); 1842 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 1843 } else 1844 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1845 1846 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1847 1848 vmx_clear_hlt(vcpu); 1849 } 1850 1851 static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, 1852 bool load_into_hardware) 1853 { 1854 struct vmx_uret_msr *uret_msr; 1855 1856 uret_msr = vmx_find_uret_msr(vmx, msr); 1857 if (!uret_msr) 1858 return; 1859 1860 uret_msr->load_into_hardware = load_into_hardware; 1861 } 1862 1863 /* 1864 * Configuring user return MSRs to automatically save, load, and restore MSRs 1865 * that need to be shoved into hardware when running the guest. Note, omitting 1866 * an MSR here does _NOT_ mean it's not emulated, only that it will not be 1867 * loaded into hardware when running the guest. 1868 */ 1869 static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) 1870 { 1871 #ifdef CONFIG_X86_64 1872 bool load_syscall_msrs; 1873 1874 /* 1875 * The SYSCALL MSRs are only needed on long mode guests, and only 1876 * when EFER.SCE is set. 1877 */ 1878 load_syscall_msrs = is_long_mode(&vmx->vcpu) && 1879 (vmx->vcpu.arch.efer & EFER_SCE); 1880 1881 vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs); 1882 vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs); 1883 vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs); 1884 #endif 1885 vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx)); 1886 1887 vmx_setup_uret_msr(vmx, MSR_TSC_AUX, 1888 guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) || 1889 guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID)); 1890 1891 /* 1892 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new 1893 * kernel and old userspace. If those guests run on a tsx=off host, do 1894 * allow guests to use TSX_CTRL, but don't change the value in hardware 1895 * so that TSX remains always disabled. 
1896 */ 1897 vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); 1898 1899 /* 1900 * The set of MSRs to load may have changed, reload MSRs before the 1901 * next VM-Enter. 1902 */ 1903 vmx->guest_uret_msrs_loaded = false; 1904 } 1905 1906 u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu) 1907 { 1908 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1909 1910 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) 1911 return vmcs12->tsc_offset; 1912 1913 return 0; 1914 } 1915 1916 u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu) 1917 { 1918 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1919 1920 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) && 1921 nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 1922 return vmcs12->tsc_multiplier; 1923 1924 return kvm_caps.default_tsc_scaling_ratio; 1925 } 1926 1927 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu) 1928 { 1929 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 1930 } 1931 1932 static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu) 1933 { 1934 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 1935 } 1936 1937 /* 1938 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of 1939 * guest CPUID. Note, KVM allows userspace to set "VMX in SMX" to maintain 1940 * backwards compatibility even though KVM doesn't support emulating SMX. And 1941 * because userspace set "VMX in SMX", the guest must also be allowed to set it, 1942 * e.g. if the MSR is left unlocked and the guest does a RMW operation. 1943 */ 1944 #define KVM_SUPPORTED_FEATURE_CONTROL (FEAT_CTL_LOCKED | \ 1945 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | \ 1946 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \ 1947 FEAT_CTL_SGX_LC_ENABLED | \ 1948 FEAT_CTL_SGX_ENABLED | \ 1949 FEAT_CTL_LMCE_ENABLED) 1950 1951 static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx, 1952 struct msr_data *msr) 1953 { 1954 uint64_t valid_bits; 1955 1956 /* 1957 * Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are 1958 * exposed to the guest. 1959 */ 1960 WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits & 1961 ~KVM_SUPPORTED_FEATURE_CONTROL); 1962 1963 if (!msr->host_initiated && 1964 (vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED)) 1965 return false; 1966 1967 if (msr->host_initiated) 1968 valid_bits = KVM_SUPPORTED_FEATURE_CONTROL; 1969 else 1970 valid_bits = vmx->msr_ia32_feature_control_valid_bits; 1971 1972 return !(msr->data & ~valid_bits); 1973 } 1974 1975 static int vmx_get_msr_feature(struct kvm_msr_entry *msr) 1976 { 1977 switch (msr->index) { 1978 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 1979 if (!nested) 1980 return 1; 1981 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); 1982 default: 1983 return KVM_MSR_RET_INVALID; 1984 } 1985 } 1986 1987 /* 1988 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'. 1989 * Returns 0 on success, non-0 otherwise. 1990 * Assumes vcpu_load() was already called. 
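 * MSRs without dedicated handling below are looked up in the user return
 * MSR table (vmx_find_uret_msr()) and, failing that, handed to
 * kvm_get_msr_common().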
1991 */ 1992 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 1993 { 1994 struct vcpu_vmx *vmx = to_vmx(vcpu); 1995 struct vmx_uret_msr *msr; 1996 u32 index; 1997 1998 switch (msr_info->index) { 1999 #ifdef CONFIG_X86_64 2000 case MSR_FS_BASE: 2001 msr_info->data = vmcs_readl(GUEST_FS_BASE); 2002 break; 2003 case MSR_GS_BASE: 2004 msr_info->data = vmcs_readl(GUEST_GS_BASE); 2005 break; 2006 case MSR_KERNEL_GS_BASE: 2007 msr_info->data = vmx_read_guest_kernel_gs_base(vmx); 2008 break; 2009 #endif 2010 case MSR_EFER: 2011 return kvm_get_msr_common(vcpu, msr_info); 2012 case MSR_IA32_TSX_CTRL: 2013 if (!msr_info->host_initiated && 2014 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2015 return 1; 2016 goto find_uret_msr; 2017 case MSR_IA32_UMWAIT_CONTROL: 2018 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2019 return 1; 2020 2021 msr_info->data = vmx->msr_ia32_umwait_control; 2022 break; 2023 case MSR_IA32_SPEC_CTRL: 2024 if (!msr_info->host_initiated && 2025 !guest_has_spec_ctrl_msr(vcpu)) 2026 return 1; 2027 2028 msr_info->data = to_vmx(vcpu)->spec_ctrl; 2029 break; 2030 case MSR_IA32_SYSENTER_CS: 2031 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 2032 break; 2033 case MSR_IA32_SYSENTER_EIP: 2034 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); 2035 break; 2036 case MSR_IA32_SYSENTER_ESP: 2037 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 2038 break; 2039 case MSR_IA32_BNDCFGS: 2040 if (!kvm_mpx_supported() || 2041 (!msr_info->host_initiated && 2042 !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) 2043 return 1; 2044 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 2045 break; 2046 case MSR_IA32_MCG_EXT_CTL: 2047 if (!msr_info->host_initiated && 2048 !(vmx->msr_ia32_feature_control & 2049 FEAT_CTL_LMCE_ENABLED)) 2050 return 1; 2051 msr_info->data = vcpu->arch.mcg_ext_ctl; 2052 break; 2053 case MSR_IA32_FEAT_CTL: 2054 msr_info->data = vmx->msr_ia32_feature_control; 2055 break; 2056 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2057 if (!msr_info->host_initiated && 2058 !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) 2059 return 1; 2060 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash 2061 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; 2062 break; 2063 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 2064 if (!guest_can_use(vcpu, X86_FEATURE_VMX)) 2065 return 1; 2066 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, 2067 &msr_info->data)) 2068 return 1; 2069 /* 2070 * Enlightened VMCS v1 doesn't have certain VMCS fields but 2071 * instead of just ignoring the features, different Hyper-V 2072 * versions are either trying to use them and fail or do some 2073 * sanity checking and refuse to boot. Filter all unsupported 2074 * features out. 
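 * Note, the filtering is applied only to guest reads; host-initiated
 * reads (i.e. from userspace) see the unfiltered values.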
2075 */ 2076 if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu)) 2077 nested_evmcs_filter_control_msr(vcpu, msr_info->index, 2078 &msr_info->data); 2079 break; 2080 case MSR_IA32_RTIT_CTL: 2081 if (!vmx_pt_mode_is_host_guest()) 2082 return 1; 2083 msr_info->data = vmx->pt_desc.guest.ctl; 2084 break; 2085 case MSR_IA32_RTIT_STATUS: 2086 if (!vmx_pt_mode_is_host_guest()) 2087 return 1; 2088 msr_info->data = vmx->pt_desc.guest.status; 2089 break; 2090 case MSR_IA32_RTIT_CR3_MATCH: 2091 if (!vmx_pt_mode_is_host_guest() || 2092 !intel_pt_validate_cap(vmx->pt_desc.caps, 2093 PT_CAP_cr3_filtering)) 2094 return 1; 2095 msr_info->data = vmx->pt_desc.guest.cr3_match; 2096 break; 2097 case MSR_IA32_RTIT_OUTPUT_BASE: 2098 if (!vmx_pt_mode_is_host_guest() || 2099 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2100 PT_CAP_topa_output) && 2101 !intel_pt_validate_cap(vmx->pt_desc.caps, 2102 PT_CAP_single_range_output))) 2103 return 1; 2104 msr_info->data = vmx->pt_desc.guest.output_base; 2105 break; 2106 case MSR_IA32_RTIT_OUTPUT_MASK: 2107 if (!vmx_pt_mode_is_host_guest() || 2108 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2109 PT_CAP_topa_output) && 2110 !intel_pt_validate_cap(vmx->pt_desc.caps, 2111 PT_CAP_single_range_output))) 2112 return 1; 2113 msr_info->data = vmx->pt_desc.guest.output_mask; 2114 break; 2115 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2116 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2117 if (!vmx_pt_mode_is_host_guest() || 2118 (index >= 2 * vmx->pt_desc.num_address_ranges)) 2119 return 1; 2120 if (index % 2) 2121 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; 2122 else 2123 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 2124 break; 2125 case MSR_IA32_DEBUGCTLMSR: 2126 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); 2127 break; 2128 default: 2129 find_uret_msr: 2130 msr = vmx_find_uret_msr(vmx, msr_info->index); 2131 if (msr) { 2132 msr_info->data = msr->data; 2133 break; 2134 } 2135 return kvm_get_msr_common(vcpu, msr_info); 2136 } 2137 2138 return 0; 2139 } 2140 2141 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, 2142 u64 data) 2143 { 2144 #ifdef CONFIG_X86_64 2145 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) 2146 return (u32)data; 2147 #endif 2148 return (unsigned long)data; 2149 } 2150 2151 static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated) 2152 { 2153 u64 debugctl = 0; 2154 2155 if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) && 2156 (host_initiated || guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))) 2157 debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT; 2158 2159 if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) && 2160 (host_initiated || intel_pmu_lbr_is_enabled(vcpu))) 2161 debugctl |= DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; 2162 2163 return debugctl; 2164 } 2165 2166 /* 2167 * Writes msr value into the appropriate "register". 2168 * Returns 0 on success, non-0 otherwise. 2169 * Assumes vcpu_load() was already called. 
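 * As in vmx_get_msr(), writes with no dedicated handling below fall
 * through to the user return MSR table and then to kvm_set_msr_common().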
2170 */ 2171 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2172 { 2173 struct vcpu_vmx *vmx = to_vmx(vcpu); 2174 struct vmx_uret_msr *msr; 2175 int ret = 0; 2176 u32 msr_index = msr_info->index; 2177 u64 data = msr_info->data; 2178 u32 index; 2179 2180 switch (msr_index) { 2181 case MSR_EFER: 2182 ret = kvm_set_msr_common(vcpu, msr_info); 2183 break; 2184 #ifdef CONFIG_X86_64 2185 case MSR_FS_BASE: 2186 vmx_segment_cache_clear(vmx); 2187 vmcs_writel(GUEST_FS_BASE, data); 2188 break; 2189 case MSR_GS_BASE: 2190 vmx_segment_cache_clear(vmx); 2191 vmcs_writel(GUEST_GS_BASE, data); 2192 break; 2193 case MSR_KERNEL_GS_BASE: 2194 vmx_write_guest_kernel_gs_base(vmx, data); 2195 break; 2196 case MSR_IA32_XFD: 2197 ret = kvm_set_msr_common(vcpu, msr_info); 2198 /* 2199 * Always intercepting WRMSR could incur non-negligible 2200 * overhead given xfd might be changed frequently in 2201 * guest context switch. Disable write interception 2202 * upon the first write with a non-zero value (indicating 2203 * potential usage on dynamic xfeatures). Also update 2204 * exception bitmap to trap #NM for proper virtualization 2205 * of guest xfd_err. 2206 */ 2207 if (!ret && data) { 2208 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, 2209 MSR_TYPE_RW); 2210 vcpu->arch.xfd_no_write_intercept = true; 2211 vmx_update_exception_bitmap(vcpu); 2212 } 2213 break; 2214 #endif 2215 case MSR_IA32_SYSENTER_CS: 2216 if (is_guest_mode(vcpu)) 2217 get_vmcs12(vcpu)->guest_sysenter_cs = data; 2218 vmcs_write32(GUEST_SYSENTER_CS, data); 2219 break; 2220 case MSR_IA32_SYSENTER_EIP: 2221 if (is_guest_mode(vcpu)) { 2222 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2223 get_vmcs12(vcpu)->guest_sysenter_eip = data; 2224 } 2225 vmcs_writel(GUEST_SYSENTER_EIP, data); 2226 break; 2227 case MSR_IA32_SYSENTER_ESP: 2228 if (is_guest_mode(vcpu)) { 2229 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2230 get_vmcs12(vcpu)->guest_sysenter_esp = data; 2231 } 2232 vmcs_writel(GUEST_SYSENTER_ESP, data); 2233 break; 2234 case MSR_IA32_DEBUGCTLMSR: { 2235 u64 invalid; 2236 2237 invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated); 2238 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { 2239 kvm_pr_unimpl_wrmsr(vcpu, msr_index, data); 2240 data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2241 invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2242 } 2243 2244 if (invalid) 2245 return 1; 2246 2247 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & 2248 VM_EXIT_SAVE_DEBUG_CONTROLS) 2249 get_vmcs12(vcpu)->guest_ia32_debugctl = data; 2250 2251 vmcs_write64(GUEST_IA32_DEBUGCTL, data); 2252 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && 2253 (data & DEBUGCTLMSR_LBR)) 2254 intel_pmu_create_guest_lbr_event(vcpu); 2255 return 0; 2256 } 2257 case MSR_IA32_BNDCFGS: 2258 if (!kvm_mpx_supported() || 2259 (!msr_info->host_initiated && 2260 !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) 2261 return 1; 2262 if (is_noncanonical_address(data & PAGE_MASK, vcpu) || 2263 (data & MSR_IA32_BNDCFGS_RSVD)) 2264 return 1; 2265 2266 if (is_guest_mode(vcpu) && 2267 ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || 2268 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) 2269 get_vmcs12(vcpu)->guest_bndcfgs = data; 2270 2271 vmcs_write64(GUEST_BNDCFGS, data); 2272 break; 2273 case MSR_IA32_UMWAIT_CONTROL: 2274 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2275 return 1; 2276 2277 /* The reserved bit 1 and non-32 bit [63:32] should be zero */ 2278 if (data 
& (BIT_ULL(1) | GENMASK_ULL(63, 32))) 2279 return 1; 2280 2281 vmx->msr_ia32_umwait_control = data; 2282 break; 2283 case MSR_IA32_SPEC_CTRL: 2284 if (!msr_info->host_initiated && 2285 !guest_has_spec_ctrl_msr(vcpu)) 2286 return 1; 2287 2288 if (kvm_spec_ctrl_test_value(data)) 2289 return 1; 2290 2291 vmx->spec_ctrl = data; 2292 if (!data) 2293 break; 2294 2295 /* 2296 * For non-nested: 2297 * When it's written (to non-zero) for the first time, pass 2298 * it through. 2299 * 2300 * For nested: 2301 * The handling of the MSR bitmap for L2 guests is done in 2302 * nested_vmx_prepare_msr_bitmap. We should not touch the 2303 * vmcs02.msr_bitmap here since it gets completely overwritten 2304 * in the merging. We update the vmcs01 here for L1 as well 2305 * since it will end up touching the MSR anyway now. 2306 */ 2307 vmx_disable_intercept_for_msr(vcpu, 2308 MSR_IA32_SPEC_CTRL, 2309 MSR_TYPE_RW); 2310 break; 2311 case MSR_IA32_TSX_CTRL: 2312 if (!msr_info->host_initiated && 2313 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2314 return 1; 2315 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) 2316 return 1; 2317 goto find_uret_msr; 2318 case MSR_IA32_CR_PAT: 2319 ret = kvm_set_msr_common(vcpu, msr_info); 2320 if (ret) 2321 break; 2322 2323 if (is_guest_mode(vcpu) && 2324 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 2325 get_vmcs12(vcpu)->guest_ia32_pat = data; 2326 2327 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 2328 vmcs_write64(GUEST_IA32_PAT, data); 2329 break; 2330 case MSR_IA32_MCG_EXT_CTL: 2331 if ((!msr_info->host_initiated && 2332 !(to_vmx(vcpu)->msr_ia32_feature_control & 2333 FEAT_CTL_LMCE_ENABLED)) || 2334 (data & ~MCG_EXT_CTL_LMCE_EN)) 2335 return 1; 2336 vcpu->arch.mcg_ext_ctl = data; 2337 break; 2338 case MSR_IA32_FEAT_CTL: 2339 if (!is_vmx_feature_control_msr_valid(vmx, msr_info)) 2340 return 1; 2341 2342 vmx->msr_ia32_feature_control = data; 2343 if (msr_info->host_initiated && data == 0) 2344 vmx_leave_nested(vcpu); 2345 2346 /* SGX may be enabled/disabled by guest's firmware */ 2347 vmx_write_encls_bitmap(vcpu, NULL); 2348 break; 2349 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2350 /* 2351 * On real hardware, the LE hash MSRs are writable before 2352 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), 2353 * at which point SGX related bits in IA32_FEATURE_CONTROL 2354 * become writable. 2355 * 2356 * KVM does not emulate SGX activation for simplicity, so 2357 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL 2358 * is unlocked. This is technically not architectural 2359 * behavior, but it's close enough. 2360 */ 2361 if (!msr_info->host_initiated && 2362 (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || 2363 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && 2364 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) 2365 return 1; 2366 vmx->msr_ia32_sgxlepubkeyhash 2367 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; 2368 break; 2369 case KVM_FIRST_EMULATED_VMX_MSR ... 
KVM_LAST_EMULATED_VMX_MSR: 2370 if (!msr_info->host_initiated) 2371 return 1; /* they are read-only */ 2372 if (!guest_can_use(vcpu, X86_FEATURE_VMX)) 2373 return 1; 2374 return vmx_set_vmx_msr(vcpu, msr_index, data); 2375 case MSR_IA32_RTIT_CTL: 2376 if (!vmx_pt_mode_is_host_guest() || 2377 vmx_rtit_ctl_check(vcpu, data) || 2378 vmx->nested.vmxon) 2379 return 1; 2380 vmcs_write64(GUEST_IA32_RTIT_CTL, data); 2381 vmx->pt_desc.guest.ctl = data; 2382 pt_update_intercept_for_msr(vcpu); 2383 break; 2384 case MSR_IA32_RTIT_STATUS: 2385 if (!pt_can_write_msr(vmx)) 2386 return 1; 2387 if (data & MSR_IA32_RTIT_STATUS_MASK) 2388 return 1; 2389 vmx->pt_desc.guest.status = data; 2390 break; 2391 case MSR_IA32_RTIT_CR3_MATCH: 2392 if (!pt_can_write_msr(vmx)) 2393 return 1; 2394 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2395 PT_CAP_cr3_filtering)) 2396 return 1; 2397 vmx->pt_desc.guest.cr3_match = data; 2398 break; 2399 case MSR_IA32_RTIT_OUTPUT_BASE: 2400 if (!pt_can_write_msr(vmx)) 2401 return 1; 2402 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2403 PT_CAP_topa_output) && 2404 !intel_pt_validate_cap(vmx->pt_desc.caps, 2405 PT_CAP_single_range_output)) 2406 return 1; 2407 if (!pt_output_base_valid(vcpu, data)) 2408 return 1; 2409 vmx->pt_desc.guest.output_base = data; 2410 break; 2411 case MSR_IA32_RTIT_OUTPUT_MASK: 2412 if (!pt_can_write_msr(vmx)) 2413 return 1; 2414 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2415 PT_CAP_topa_output) && 2416 !intel_pt_validate_cap(vmx->pt_desc.caps, 2417 PT_CAP_single_range_output)) 2418 return 1; 2419 vmx->pt_desc.guest.output_mask = data; 2420 break; 2421 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 2422 if (!pt_can_write_msr(vmx)) 2423 return 1; 2424 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2425 if (index >= 2 * vmx->pt_desc.num_address_ranges) 2426 return 1; 2427 if (is_noncanonical_address(data, vcpu)) 2428 return 1; 2429 if (index % 2) 2430 vmx->pt_desc.guest.addr_b[index / 2] = data; 2431 else 2432 vmx->pt_desc.guest.addr_a[index / 2] = data; 2433 break; 2434 case MSR_IA32_PERF_CAPABILITIES: 2435 if (data && !vcpu_to_pmu(vcpu)->version) 2436 return 1; 2437 if (data & PMU_CAP_LBR_FMT) { 2438 if ((data & PMU_CAP_LBR_FMT) != 2439 (kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT)) 2440 return 1; 2441 if (!cpuid_model_is_consistent(vcpu)) 2442 return 1; 2443 } 2444 if (data & PERF_CAP_PEBS_FORMAT) { 2445 if ((data & PERF_CAP_PEBS_MASK) != 2446 (kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK)) 2447 return 1; 2448 if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) 2449 return 1; 2450 if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) 2451 return 1; 2452 if (!cpuid_model_is_consistent(vcpu)) 2453 return 1; 2454 } 2455 ret = kvm_set_msr_common(vcpu, msr_info); 2456 break; 2457 2458 default: 2459 find_uret_msr: 2460 msr = vmx_find_uret_msr(vmx, msr_index); 2461 if (msr) 2462 ret = vmx_set_guest_uret_msr(vmx, msr, data); 2463 else 2464 ret = kvm_set_msr_common(vcpu, msr_info); 2465 } 2466 2467 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ 2468 if (msr_index == MSR_IA32_ARCH_CAPABILITIES) 2469 vmx_update_fb_clear_dis(vcpu, vmx); 2470 2471 return ret; 2472 } 2473 2474 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2475 { 2476 unsigned long guest_owned_bits; 2477 2478 kvm_register_mark_available(vcpu, reg); 2479 2480 switch (reg) { 2481 case VCPU_REGS_RSP: 2482 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2483 break; 2484 case VCPU_REGS_RIP: 2485 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 
2486 break; 2487 case VCPU_EXREG_PDPTR: 2488 if (enable_ept) 2489 ept_save_pdptrs(vcpu); 2490 break; 2491 case VCPU_EXREG_CR0: 2492 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 2493 2494 vcpu->arch.cr0 &= ~guest_owned_bits; 2495 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; 2496 break; 2497 case VCPU_EXREG_CR3: 2498 /* 2499 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's 2500 * CR3 is loaded into hardware, not the guest's CR3. 2501 */ 2502 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) 2503 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 2504 break; 2505 case VCPU_EXREG_CR4: 2506 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2507 2508 vcpu->arch.cr4 &= ~guest_owned_bits; 2509 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; 2510 break; 2511 default: 2512 KVM_BUG_ON(1, vcpu->kvm); 2513 break; 2514 } 2515 } 2516 2517 /* 2518 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID 2519 * directly instead of going through cpu_has(), to ensure KVM is trapping 2520 * ENCLS whenever it's supported in hardware. It does not matter whether 2521 * the host OS supports or has enabled SGX. 2522 */ 2523 static bool cpu_has_sgx(void) 2524 { 2525 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); 2526 } 2527 2528 /* 2529 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they 2530 * can't be used due to errata where VM Exit may incorrectly clear 2531 * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the 2532 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 2533 */ 2534 static bool cpu_has_perf_global_ctrl_bug(void) 2535 { 2536 if (boot_cpu_data.x86 == 0x6) { 2537 switch (boot_cpu_data.x86_model) { 2538 case INTEL_FAM6_NEHALEM_EP: /* AAK155 */ 2539 case INTEL_FAM6_NEHALEM: /* AAP115 */ 2540 case INTEL_FAM6_WESTMERE: /* AAT100 */ 2541 case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */ 2542 case INTEL_FAM6_NEHALEM_EX: /* BA97 */ 2543 return true; 2544 default: 2545 break; 2546 } 2547 } 2548 2549 return false; 2550 } 2551 2552 static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) 2553 { 2554 u32 vmx_msr_low, vmx_msr_high; 2555 u32 ctl = ctl_min | ctl_opt; 2556 2557 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2558 2559 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2560 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2561 2562 /* Ensure minimum (required) set of control bits are supported. */ 2563 if (ctl_min & ~ctl) 2564 return -EIO; 2565 2566 *result = ctl; 2567 return 0; 2568 } 2569 2570 static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) 2571 { 2572 u64 allowed; 2573 2574 rdmsrl(msr, allowed); 2575 2576 return ctl_opt & allowed; 2577 } 2578 2579 static int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2580 struct vmx_capability *vmx_cap) 2581 { 2582 u32 vmx_msr_low, vmx_msr_high; 2583 u32 _pin_based_exec_control = 0; 2584 u32 _cpu_based_exec_control = 0; 2585 u32 _cpu_based_2nd_exec_control = 0; 2586 u64 _cpu_based_3rd_exec_control = 0; 2587 u32 _vmexit_control = 0; 2588 u32 _vmentry_control = 0; 2589 u64 misc_msr; 2590 int i; 2591 2592 /* 2593 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. 2594 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always 2595 * intercepts writes to PAT and EFER, i.e. never enables those controls. 
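 * Each entry below pairs a VM-Entry control with its VM-Exit counterpart;
 * the consistency loop further down discards (or, with
 * error_on_inconsistent_vmcs_config, rejects) any pair where only one of
 * the two controls ended up enabled.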
2596 */
2597 struct {
2598 u32 entry_control;
2599 u32 exit_control;
2600 } const vmcs_entry_exit_pairs[] = {
2601 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
2602 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
2603 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
2604 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
2605 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
2606 };
2607 
2608 memset(vmcs_conf, 0, sizeof(*vmcs_conf));
2609 
2610 if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
2611 KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
2612 MSR_IA32_VMX_PROCBASED_CTLS,
2613 &_cpu_based_exec_control))
2614 return -EIO;
2615 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2616 if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
2617 KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
2618 MSR_IA32_VMX_PROCBASED_CTLS2,
2619 &_cpu_based_2nd_exec_control))
2620 return -EIO;
2621 }
2622 #ifndef CONFIG_X86_64
2623 if (!(_cpu_based_2nd_exec_control &
2624 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2625 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2626 #endif
2627 
2628 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2629 _cpu_based_2nd_exec_control &= ~(
2630 SECONDARY_EXEC_APIC_REGISTER_VIRT |
2631 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2632 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2633 
2634 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2635 &vmx_cap->ept, &vmx_cap->vpid);
2636 
2637 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
2638 vmx_cap->ept) {
2639 pr_warn_once("EPT capabilities are advertised even though the "
2640 "'enable EPT' VM-execution control is not supported\n");
2641 
2642 if (error_on_inconsistent_vmcs_config)
2643 return -EIO;
2644 
2645 vmx_cap->ept = 0;
2646 }
2647 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2648 vmx_cap->vpid) {
2649 pr_warn_once("VPID capability is advertised even though the "
2650 "'enable VPID' VM-execution control is not supported\n");
2651 
2652 if (error_on_inconsistent_vmcs_config)
2653 return -EIO;
2654 
2655 vmx_cap->vpid = 0;
2656 }
2657 
2658 if (!cpu_has_sgx())
2659 _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
2660 
2661 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
2662 _cpu_based_3rd_exec_control =
2663 adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
2664 MSR_IA32_VMX_PROCBASED_CTLS3);
2665 
2666 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
2667 KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
2668 MSR_IA32_VMX_EXIT_CTLS,
2669 &_vmexit_control))
2670 return -EIO;
2671 
2672 if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
2673 KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
2674 MSR_IA32_VMX_PINBASED_CTLS,
2675 &_pin_based_exec_control))
2676 return -EIO;
2677 
2678 if (cpu_has_broken_vmx_preemption_timer())
2679 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2680 if (!(_cpu_based_2nd_exec_control &
2681 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2682 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2683 
2684 if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
2685 KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
2686 MSR_IA32_VMX_ENTRY_CTLS,
2687 &_vmentry_control))
2688 return -EIO;
2689 
2690 for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
2691 u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
2692 u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
2693 
2694 if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
2695 continue;
2696 
2697
pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", 2698 _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); 2699 2700 if (error_on_inconsistent_vmcs_config) 2701 return -EIO; 2702 2703 _vmentry_control &= ~n_ctrl; 2704 _vmexit_control &= ~x_ctrl; 2705 } 2706 2707 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); 2708 2709 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2710 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) 2711 return -EIO; 2712 2713 #ifdef CONFIG_X86_64 2714 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ 2715 if (vmx_msr_high & (1u<<16)) 2716 return -EIO; 2717 #endif 2718 2719 /* Require Write-Back (WB) memory type for VMCS accesses. */ 2720 if (((vmx_msr_high >> 18) & 15) != 6) 2721 return -EIO; 2722 2723 rdmsrl(MSR_IA32_VMX_MISC, misc_msr); 2724 2725 vmcs_conf->size = vmx_msr_high & 0x1fff; 2726 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; 2727 2728 vmcs_conf->revision_id = vmx_msr_low; 2729 2730 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2731 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2732 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2733 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; 2734 vmcs_conf->vmexit_ctrl = _vmexit_control; 2735 vmcs_conf->vmentry_ctrl = _vmentry_control; 2736 vmcs_conf->misc = misc_msr; 2737 2738 #if IS_ENABLED(CONFIG_HYPERV) 2739 if (enlightened_vmcs) 2740 evmcs_sanitize_exec_ctrls(vmcs_conf); 2741 #endif 2742 2743 return 0; 2744 } 2745 2746 static bool __kvm_is_vmx_supported(void) 2747 { 2748 int cpu = smp_processor_id(); 2749 2750 if (!(cpuid_ecx(1) & feature_bit(VMX))) { 2751 pr_err("VMX not supported by CPU %d\n", cpu); 2752 return false; 2753 } 2754 2755 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || 2756 !this_cpu_has(X86_FEATURE_VMX)) { 2757 pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu); 2758 return false; 2759 } 2760 2761 return true; 2762 } 2763 2764 static bool kvm_is_vmx_supported(void) 2765 { 2766 bool supported; 2767 2768 migrate_disable(); 2769 supported = __kvm_is_vmx_supported(); 2770 migrate_enable(); 2771 2772 return supported; 2773 } 2774 2775 static int vmx_check_processor_compat(void) 2776 { 2777 int cpu = raw_smp_processor_id(); 2778 struct vmcs_config vmcs_conf; 2779 struct vmx_capability vmx_cap; 2780 2781 if (!__kvm_is_vmx_supported()) 2782 return -EIO; 2783 2784 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) { 2785 pr_err("Failed to setup VMCS config on CPU %d\n", cpu); 2786 return -EIO; 2787 } 2788 if (nested) 2789 nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept); 2790 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) { 2791 pr_err("Inconsistent VMCS config on CPU %d\n", cpu); 2792 return -EIO; 2793 } 2794 return 0; 2795 } 2796 2797 static int kvm_cpu_vmxon(u64 vmxon_pointer) 2798 { 2799 u64 msr; 2800 2801 cr4_set_bits(X86_CR4_VMXE); 2802 2803 asm goto("1: vmxon %[vmxon_pointer]\n\t" 2804 _ASM_EXTABLE(1b, %l[fault]) 2805 : : [vmxon_pointer] "m"(vmxon_pointer) 2806 : : fault); 2807 return 0; 2808 2809 fault: 2810 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", 2811 rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 
0xdeadbeef : msr); 2812 cr4_clear_bits(X86_CR4_VMXE); 2813 2814 return -EFAULT; 2815 } 2816 2817 static int vmx_hardware_enable(void) 2818 { 2819 int cpu = raw_smp_processor_id(); 2820 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2821 int r; 2822 2823 if (cr4_read_shadow() & X86_CR4_VMXE) 2824 return -EBUSY; 2825 2826 /* 2827 * This can happen if we hot-added a CPU but failed to allocate 2828 * VP assist page for it. 2829 */ 2830 if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu)) 2831 return -EFAULT; 2832 2833 intel_pt_handle_vmx(1); 2834 2835 r = kvm_cpu_vmxon(phys_addr); 2836 if (r) { 2837 intel_pt_handle_vmx(0); 2838 return r; 2839 } 2840 2841 if (enable_ept) 2842 ept_sync_global(); 2843 2844 return 0; 2845 } 2846 2847 static void vmclear_local_loaded_vmcss(void) 2848 { 2849 int cpu = raw_smp_processor_id(); 2850 struct loaded_vmcs *v, *n; 2851 2852 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2853 loaded_vmcss_on_cpu_link) 2854 __loaded_vmcs_clear(v); 2855 } 2856 2857 static void vmx_hardware_disable(void) 2858 { 2859 vmclear_local_loaded_vmcss(); 2860 2861 if (kvm_cpu_vmxoff()) 2862 kvm_spurious_fault(); 2863 2864 hv_reset_evmcs(); 2865 2866 intel_pt_handle_vmx(0); 2867 } 2868 2869 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 2870 { 2871 int node = cpu_to_node(cpu); 2872 struct page *pages; 2873 struct vmcs *vmcs; 2874 2875 pages = __alloc_pages_node(node, flags, 0); 2876 if (!pages) 2877 return NULL; 2878 vmcs = page_address(pages); 2879 memset(vmcs, 0, vmcs_config.size); 2880 2881 /* KVM supports Enlightened VMCS v1 only */ 2882 if (kvm_is_using_evmcs()) 2883 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2884 else 2885 vmcs->hdr.revision_id = vmcs_config.revision_id; 2886 2887 if (shadow) 2888 vmcs->hdr.shadow_vmcs = 1; 2889 return vmcs; 2890 } 2891 2892 void free_vmcs(struct vmcs *vmcs) 2893 { 2894 free_page((unsigned long)vmcs); 2895 } 2896 2897 /* 2898 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 2899 */ 2900 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2901 { 2902 if (!loaded_vmcs->vmcs) 2903 return; 2904 loaded_vmcs_clear(loaded_vmcs); 2905 free_vmcs(loaded_vmcs->vmcs); 2906 loaded_vmcs->vmcs = NULL; 2907 if (loaded_vmcs->msr_bitmap) 2908 free_page((unsigned long)loaded_vmcs->msr_bitmap); 2909 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 2910 } 2911 2912 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2913 { 2914 loaded_vmcs->vmcs = alloc_vmcs(false); 2915 if (!loaded_vmcs->vmcs) 2916 return -ENOMEM; 2917 2918 vmcs_clear(loaded_vmcs->vmcs); 2919 2920 loaded_vmcs->shadow_vmcs = NULL; 2921 loaded_vmcs->hv_timer_soft_disabled = false; 2922 loaded_vmcs->cpu = -1; 2923 loaded_vmcs->launched = 0; 2924 2925 if (cpu_has_vmx_msr_bitmap()) { 2926 loaded_vmcs->msr_bitmap = (unsigned long *) 2927 __get_free_page(GFP_KERNEL_ACCOUNT); 2928 if (!loaded_vmcs->msr_bitmap) 2929 goto out_vmcs; 2930 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 2931 } 2932 2933 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 2934 memset(&loaded_vmcs->controls_shadow, 0, 2935 sizeof(struct vmcs_controls_shadow)); 2936 2937 return 0; 2938 2939 out_vmcs: 2940 free_loaded_vmcs(loaded_vmcs); 2941 return -ENOMEM; 2942 } 2943 2944 static void free_kvm_area(void) 2945 { 2946 int cpu; 2947 2948 for_each_possible_cpu(cpu) { 2949 free_vmcs(per_cpu(vmxarea, cpu)); 2950 per_cpu(vmxarea, cpu) = NULL; 2951 } 2952 } 2953 2954 static __init int alloc_kvm_area(void) 2955 { 2956 int cpu; 2957 2958 
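/*
 * Allocate one VMXON region per possible CPU; vmx_hardware_enable()
 * passes the physical address of its CPU's region to VMXON via
 * kvm_cpu_vmxon().
 */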
for_each_possible_cpu(cpu) { 2959 struct vmcs *vmcs; 2960 2961 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); 2962 if (!vmcs) { 2963 free_kvm_area(); 2964 return -ENOMEM; 2965 } 2966 2967 /* 2968 * When eVMCS is enabled, alloc_vmcs_cpu() sets 2969 * vmcs->revision_id to KVM_EVMCS_VERSION instead of 2970 * revision_id reported by MSR_IA32_VMX_BASIC. 2971 * 2972 * However, even though not explicitly documented by 2973 * TLFS, VMXArea passed as VMXON argument should 2974 * still be marked with revision_id reported by 2975 * physical CPU. 2976 */ 2977 if (kvm_is_using_evmcs()) 2978 vmcs->hdr.revision_id = vmcs_config.revision_id; 2979 2980 per_cpu(vmxarea, cpu) = vmcs; 2981 } 2982 return 0; 2983 } 2984 2985 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 2986 struct kvm_segment *save) 2987 { 2988 if (!emulate_invalid_guest_state) { 2989 /* 2990 * CS and SS RPL should be equal during guest entry according 2991 * to VMX spec, but in reality it is not always so. Since vcpu 2992 * is in the middle of the transition from real mode to 2993 * protected mode it is safe to assume that RPL 0 is a good 2994 * default value. 2995 */ 2996 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 2997 save->selector &= ~SEGMENT_RPL_MASK; 2998 save->dpl = save->selector & SEGMENT_RPL_MASK; 2999 save->s = 1; 3000 } 3001 __vmx_set_segment(vcpu, save, seg); 3002 } 3003 3004 static void enter_pmode(struct kvm_vcpu *vcpu) 3005 { 3006 unsigned long flags; 3007 struct vcpu_vmx *vmx = to_vmx(vcpu); 3008 3009 /* 3010 * Update real mode segment cache. It may be not up-to-date if segment 3011 * register was written while vcpu was in a guest mode. 3012 */ 3013 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3014 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3015 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3016 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3017 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3018 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3019 3020 vmx->rmode.vm86_active = 0; 3021 3022 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3023 3024 flags = vmcs_readl(GUEST_RFLAGS); 3025 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 3026 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 3027 vmcs_writel(GUEST_RFLAGS, flags); 3028 3029 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 3030 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 3031 3032 vmx_update_exception_bitmap(vcpu); 3033 3034 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3035 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3036 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3037 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3038 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3039 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3040 } 3041 3042 static void fix_rmode_seg(int seg, struct kvm_segment *save) 3043 { 3044 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3045 struct kvm_segment var = *save; 3046 3047 var.dpl = 0x3; 3048 if (seg == VCPU_SREG_CS) 3049 var.type = 0x3; 3050 3051 if (!emulate_invalid_guest_state) { 3052 var.selector = var.base >> 4; 3053 var.base = var.base & 0xffff0; 3054 var.limit = 0xffff; 3055 var.g = 0; 3056 var.db = 0; 3057 var.present = 1; 3058 var.s = 1; 3059 var.l = 0; 3060 var.unusable = 0; 3061 
var.type = 0x3; 3062 var.avl = 0; 3063 if (save->base & 0xf) 3064 pr_warn_once("segment base is not paragraph aligned " 3065 "when entering protected mode (seg=%d)", seg); 3066 } 3067 3068 vmcs_write16(sf->selector, var.selector); 3069 vmcs_writel(sf->base, var.base); 3070 vmcs_write32(sf->limit, var.limit); 3071 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3072 } 3073 3074 static void enter_rmode(struct kvm_vcpu *vcpu) 3075 { 3076 unsigned long flags; 3077 struct vcpu_vmx *vmx = to_vmx(vcpu); 3078 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); 3079 3080 /* 3081 * KVM should never use VM86 to virtualize Real Mode when L2 is active, 3082 * as using VM86 is unnecessary if unrestricted guest is enabled, and 3083 * if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0 3084 * should VM-Fail and KVM should reject userspace attempts to stuff 3085 * CR0.PG=0 when L2 is active. 3086 */ 3087 WARN_ON_ONCE(is_guest_mode(vcpu)); 3088 3089 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3090 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3091 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3092 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3093 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3094 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3095 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3096 3097 vmx->rmode.vm86_active = 1; 3098 3099 vmx_segment_cache_clear(vmx); 3100 3101 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); 3102 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3103 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3104 3105 flags = vmcs_readl(GUEST_RFLAGS); 3106 vmx->rmode.save_rflags = flags; 3107 3108 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3109 3110 vmcs_writel(GUEST_RFLAGS, flags); 3111 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3112 vmx_update_exception_bitmap(vcpu); 3113 3114 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3115 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3116 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3117 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3118 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3119 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3120 } 3121 3122 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 3123 { 3124 struct vcpu_vmx *vmx = to_vmx(vcpu); 3125 3126 /* Nothing to do if hardware doesn't support EFER. */ 3127 if (!vmx_find_uret_msr(vmx, MSR_EFER)) 3128 return 0; 3129 3130 vcpu->arch.efer = efer; 3131 #ifdef CONFIG_X86_64 3132 if (efer & EFER_LMA) 3133 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); 3134 else 3135 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); 3136 #else 3137 if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm)) 3138 return 1; 3139 #endif 3140 3141 vmx_setup_uret_msrs(vmx); 3142 return 0; 3143 } 3144 3145 #ifdef CONFIG_X86_64 3146 3147 static void enter_lmode(struct kvm_vcpu *vcpu) 3148 { 3149 u32 guest_tr_ar; 3150 3151 vmx_segment_cache_clear(to_vmx(vcpu)); 3152 3153 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 3154 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { 3155 pr_debug_ratelimited("%s: tss fixup for long mode. 
\n", 3156 __func__); 3157 vmcs_write32(GUEST_TR_AR_BYTES, 3158 (guest_tr_ar & ~VMX_AR_TYPE_MASK) 3159 | VMX_AR_TYPE_BUSY_64_TSS); 3160 } 3161 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 3162 } 3163 3164 static void exit_lmode(struct kvm_vcpu *vcpu) 3165 { 3166 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3167 } 3168 3169 #endif 3170 3171 static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) 3172 { 3173 struct vcpu_vmx *vmx = to_vmx(vcpu); 3174 3175 /* 3176 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as 3177 * the CPU is not required to invalidate guest-physical mappings on 3178 * VM-Entry, even if VPID is disabled. Guest-physical mappings are 3179 * associated with the root EPT structure and not any particular VPID 3180 * (INVVPID also isn't required to invalidate guest-physical mappings). 3181 */ 3182 if (enable_ept) { 3183 ept_sync_global(); 3184 } else if (enable_vpid) { 3185 if (cpu_has_vmx_invvpid_global()) { 3186 vpid_sync_vcpu_global(); 3187 } else { 3188 vpid_sync_vcpu_single(vmx->vpid); 3189 vpid_sync_vcpu_single(vmx->nested.vpid02); 3190 } 3191 } 3192 } 3193 3194 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) 3195 { 3196 if (is_guest_mode(vcpu)) 3197 return nested_get_vpid02(vcpu); 3198 return to_vmx(vcpu)->vpid; 3199 } 3200 3201 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) 3202 { 3203 struct kvm_mmu *mmu = vcpu->arch.mmu; 3204 u64 root_hpa = mmu->root.hpa; 3205 3206 /* No flush required if the current context is invalid. */ 3207 if (!VALID_PAGE(root_hpa)) 3208 return; 3209 3210 if (enable_ept) 3211 ept_sync_context(construct_eptp(vcpu, root_hpa, 3212 mmu->root_role.level)); 3213 else 3214 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3215 } 3216 3217 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 3218 { 3219 /* 3220 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in 3221 * vmx_flush_tlb_guest() for an explanation of why this is ok. 3222 */ 3223 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); 3224 } 3225 3226 static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) 3227 { 3228 /* 3229 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a 3230 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are 3231 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is 3232 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), 3233 * i.e. no explicit INVVPID is necessary. 
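 * Conversely, when a non-zero VPID is in use, linear and combined
 * mappings tagged with that VPID may survive VM-Entry/VM-Exit, so the
 * single-context INVVPID issued below is what actually emulates the
 * guest's TLB flush.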
3234 */ 3235 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3236 } 3237 3238 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) 3239 { 3240 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3241 3242 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) 3243 return; 3244 3245 if (is_pae_paging(vcpu)) { 3246 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3247 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3248 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3249 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3250 } 3251 } 3252 3253 void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3254 { 3255 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3256 3257 if (WARN_ON_ONCE(!is_pae_paging(vcpu))) 3258 return; 3259 3260 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3261 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3262 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3263 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3264 3265 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); 3266 } 3267 3268 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ 3269 CPU_BASED_CR3_STORE_EXITING) 3270 3271 static bool vmx_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3272 { 3273 if (is_guest_mode(vcpu)) 3274 return nested_guest_cr0_valid(vcpu, cr0); 3275 3276 if (to_vmx(vcpu)->nested.vmxon) 3277 return nested_host_cr0_valid(vcpu, cr0); 3278 3279 return true; 3280 } 3281 3282 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3283 { 3284 struct vcpu_vmx *vmx = to_vmx(vcpu); 3285 unsigned long hw_cr0, old_cr0_pg; 3286 u32 tmp; 3287 3288 old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); 3289 3290 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); 3291 if (enable_unrestricted_guest) 3292 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3293 else { 3294 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 3295 if (!enable_ept) 3296 hw_cr0 |= X86_CR0_WP; 3297 3298 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3299 enter_pmode(vcpu); 3300 3301 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3302 enter_rmode(vcpu); 3303 } 3304 3305 vmcs_writel(CR0_READ_SHADOW, cr0); 3306 vmcs_writel(GUEST_CR0, hw_cr0); 3307 vcpu->arch.cr0 = cr0; 3308 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); 3309 3310 #ifdef CONFIG_X86_64 3311 if (vcpu->arch.efer & EFER_LME) { 3312 if (!old_cr0_pg && (cr0 & X86_CR0_PG)) 3313 enter_lmode(vcpu); 3314 else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) 3315 exit_lmode(vcpu); 3316 } 3317 #endif 3318 3319 if (enable_ept && !enable_unrestricted_guest) { 3320 /* 3321 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If 3322 * the below code _enables_ CR3 exiting, vmx_cache_reg() will 3323 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks 3324 * KVM's CR3 is installed. 3325 */ 3326 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) 3327 vmx_cache_reg(vcpu, VCPU_EXREG_CR3); 3328 3329 /* 3330 * When running with EPT but not unrestricted guest, KVM must 3331 * intercept CR3 accesses when paging is _disabled_. This is 3332 * necessary because restricted guests can't actually run with 3333 * paging disabled, and so KVM stuffs its own CR3 in order to 3334 * run the guest when identity mapped page tables. 3335 * 3336 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the 3337 * update, it may be stale with respect to CR3 interception, 3338 * e.g. after nested VM-Enter. 3339 * 3340 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or 3341 * stores to forward them to L1, even if KVM does not need to 3342 * intercept them to preserve its identity mapped page tables. 
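 * In short: paging disabled -> intercept CR3 accesses; paging enabled
 * and not in guest mode -> stop intercepting; guest mode -> mirror the
 * CR3-exiting controls that L1 requested in vmcs12.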
3343 */ 3344 if (!(cr0 & X86_CR0_PG)) { 3345 exec_controls_setbit(vmx, CR3_EXITING_BITS); 3346 } else if (!is_guest_mode(vcpu)) { 3347 exec_controls_clearbit(vmx, CR3_EXITING_BITS); 3348 } else { 3349 tmp = exec_controls_get(vmx); 3350 tmp &= ~CR3_EXITING_BITS; 3351 tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; 3352 exec_controls_set(vmx, tmp); 3353 } 3354 3355 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ 3356 if ((old_cr0_pg ^ cr0) & X86_CR0_PG) 3357 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3358 3359 /* 3360 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but 3361 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. 3362 */ 3363 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) 3364 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 3365 } 3366 3367 /* depends on vcpu->arch.cr0 to be set to a new value */ 3368 vmx->emulation_required = vmx_emulation_required(vcpu); 3369 } 3370 3371 static int vmx_get_max_ept_level(void) 3372 { 3373 if (cpu_has_vmx_ept_5levels()) 3374 return 5; 3375 return 4; 3376 } 3377 3378 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3379 { 3380 u64 eptp = VMX_EPTP_MT_WB; 3381 3382 eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; 3383 3384 if (enable_ept_ad_bits && 3385 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) 3386 eptp |= VMX_EPTP_AD_ENABLE_BIT; 3387 eptp |= root_hpa; 3388 3389 return eptp; 3390 } 3391 3392 static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, 3393 int root_level) 3394 { 3395 struct kvm *kvm = vcpu->kvm; 3396 bool update_guest_cr3 = true; 3397 unsigned long guest_cr3; 3398 u64 eptp; 3399 3400 if (enable_ept) { 3401 eptp = construct_eptp(vcpu, root_hpa, root_level); 3402 vmcs_write64(EPT_POINTER, eptp); 3403 3404 hv_track_root_tdp(vcpu, root_hpa); 3405 3406 if (!enable_unrestricted_guest && !is_paging(vcpu)) 3407 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; 3408 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) 3409 guest_cr3 = vcpu->arch.cr3; 3410 else /* vmcs.GUEST_CR3 is already up-to-date. */ 3411 update_guest_cr3 = false; 3412 vmx_ept_load_pdptrs(vcpu); 3413 } else { 3414 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu); 3415 } 3416 3417 if (update_guest_cr3) 3418 vmcs_writel(GUEST_CR3, guest_cr3); 3419 } 3420 3421 3422 static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3423 { 3424 /* 3425 * We operate under the default treatment of SMM, so VMX cannot be 3426 * enabled under SMM. Note, whether or not VMXE is allowed at all, 3427 * i.e. is a reserved bit, is handled by common x86 code. 3428 */ 3429 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) 3430 return false; 3431 3432 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) 3433 return false; 3434 3435 return true; 3436 } 3437 3438 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3439 { 3440 unsigned long old_cr4 = kvm_read_cr4(vcpu); 3441 struct vcpu_vmx *vmx = to_vmx(vcpu); 3442 unsigned long hw_cr4; 3443 3444 /* 3445 * Pass through host's Machine Check Enable value to hw_cr4, which 3446 * is in force while we are in guest mode. Do not let guests control 3447 * this bit, even if host CR4.MCE == 0. 
3448 */ 3449 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); 3450 if (enable_unrestricted_guest) 3451 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; 3452 else if (vmx->rmode.vm86_active) 3453 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; 3454 else 3455 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; 3456 3457 if (vmx_umip_emulated()) { 3458 if (cr4 & X86_CR4_UMIP) { 3459 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); 3460 hw_cr4 &= ~X86_CR4_UMIP; 3461 } else if (!is_guest_mode(vcpu) || 3462 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { 3463 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); 3464 } 3465 } 3466 3467 vcpu->arch.cr4 = cr4; 3468 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); 3469 3470 if (!enable_unrestricted_guest) { 3471 if (enable_ept) { 3472 if (!is_paging(vcpu)) { 3473 hw_cr4 &= ~X86_CR4_PAE; 3474 hw_cr4 |= X86_CR4_PSE; 3475 } else if (!(cr4 & X86_CR4_PAE)) { 3476 hw_cr4 &= ~X86_CR4_PAE; 3477 } 3478 } 3479 3480 /* 3481 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in 3482 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs 3483 * to be manually disabled when guest switches to non-paging 3484 * mode. 3485 * 3486 * If !enable_unrestricted_guest, the CPU is always running 3487 * with CR0.PG=1 and CR4 needs to be modified. 3488 * If enable_unrestricted_guest, the CPU automatically 3489 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. 3490 */ 3491 if (!is_paging(vcpu)) 3492 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 3493 } 3494 3495 vmcs_writel(CR4_READ_SHADOW, cr4); 3496 vmcs_writel(GUEST_CR4, hw_cr4); 3497 3498 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) 3499 kvm_update_cpuid_runtime(vcpu); 3500 } 3501 3502 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3503 { 3504 struct vcpu_vmx *vmx = to_vmx(vcpu); 3505 u32 ar; 3506 3507 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3508 *var = vmx->rmode.segs[seg]; 3509 if (seg == VCPU_SREG_TR 3510 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3511 return; 3512 var->base = vmx_read_guest_seg_base(vmx, seg); 3513 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3514 return; 3515 } 3516 var->base = vmx_read_guest_seg_base(vmx, seg); 3517 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3518 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3519 ar = vmx_read_guest_seg_ar(vmx, seg); 3520 var->unusable = (ar >> 16) & 1; 3521 var->type = ar & 15; 3522 var->s = (ar >> 4) & 1; 3523 var->dpl = (ar >> 5) & 3; 3524 /* 3525 * Some userspaces do not preserve unusable property. Since usable 3526 * segment has to be present according to VMX spec we can use present 3527 * property to amend userspace bug by making unusable segment always 3528 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3529 * segment as unusable. 
3530 */ 3531 var->present = !var->unusable; 3532 var->avl = (ar >> 12) & 1; 3533 var->l = (ar >> 13) & 1; 3534 var->db = (ar >> 14) & 1; 3535 var->g = (ar >> 15) & 1; 3536 } 3537 3538 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3539 { 3540 struct kvm_segment s; 3541 3542 if (to_vmx(vcpu)->rmode.vm86_active) { 3543 vmx_get_segment(vcpu, &s, seg); 3544 return s.base; 3545 } 3546 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3547 } 3548 3549 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3550 { 3551 struct vcpu_vmx *vmx = to_vmx(vcpu); 3552 3553 if (unlikely(vmx->rmode.vm86_active)) 3554 return 0; 3555 else { 3556 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3557 return VMX_AR_DPL(ar); 3558 } 3559 } 3560 3561 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3562 { 3563 u32 ar; 3564 3565 ar = var->type & 15; 3566 ar |= (var->s & 1) << 4; 3567 ar |= (var->dpl & 3) << 5; 3568 ar |= (var->present & 1) << 7; 3569 ar |= (var->avl & 1) << 12; 3570 ar |= (var->l & 1) << 13; 3571 ar |= (var->db & 1) << 14; 3572 ar |= (var->g & 1) << 15; 3573 ar |= (var->unusable || !var->present) << 16; 3574 3575 return ar; 3576 } 3577 3578 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3579 { 3580 struct vcpu_vmx *vmx = to_vmx(vcpu); 3581 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3582 3583 vmx_segment_cache_clear(vmx); 3584 3585 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3586 vmx->rmode.segs[seg] = *var; 3587 if (seg == VCPU_SREG_TR) 3588 vmcs_write16(sf->selector, var->selector); 3589 else if (var->s) 3590 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3591 return; 3592 } 3593 3594 vmcs_writel(sf->base, var->base); 3595 vmcs_write32(sf->limit, var->limit); 3596 vmcs_write16(sf->selector, var->selector); 3597 3598 /* 3599 * Fix the "Accessed" bit in AR field of segment registers for older 3600 * qemu binaries. 3601 * IA32 arch specifies that at the time of processor reset the 3602 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3603 * is setting it to 0 in the userland code. This causes invalid guest 3604 * state vmexit when "unrestricted guest" mode is turned on. 3605 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3606 * tree. Newer qemu binaries with that qemu fix would not need this 3607 * kvm hack. 
3608 */ 3609 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) 3610 var->type |= 0x1; /* Accessed */ 3611 3612 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3613 } 3614 3615 static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3616 { 3617 __vmx_set_segment(vcpu, var, seg); 3618 3619 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 3620 } 3621 3622 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3623 { 3624 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3625 3626 *db = (ar >> 14) & 1; 3627 *l = (ar >> 13) & 1; 3628 } 3629 3630 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3631 { 3632 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3633 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3634 } 3635 3636 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3637 { 3638 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3639 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3640 } 3641 3642 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3643 { 3644 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3645 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3646 } 3647 3648 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3649 { 3650 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3651 vmcs_writel(GUEST_GDTR_BASE, dt->address); 3652 } 3653 3654 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3655 { 3656 struct kvm_segment var; 3657 u32 ar; 3658 3659 vmx_get_segment(vcpu, &var, seg); 3660 var.dpl = 0x3; 3661 if (seg == VCPU_SREG_CS) 3662 var.type = 0x3; 3663 ar = vmx_segment_access_rights(&var); 3664 3665 if (var.base != (var.selector << 4)) 3666 return false; 3667 if (var.limit != 0xffff) 3668 return false; 3669 if (ar != 0xf3) 3670 return false; 3671 3672 return true; 3673 } 3674 3675 static bool code_segment_valid(struct kvm_vcpu *vcpu) 3676 { 3677 struct kvm_segment cs; 3678 unsigned int cs_rpl; 3679 3680 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3681 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3682 3683 if (cs.unusable) 3684 return false; 3685 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 3686 return false; 3687 if (!cs.s) 3688 return false; 3689 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 3690 if (cs.dpl > cs_rpl) 3691 return false; 3692 } else { 3693 if (cs.dpl != cs_rpl) 3694 return false; 3695 } 3696 if (!cs.present) 3697 return false; 3698 3699 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3700 return true; 3701 } 3702 3703 static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3704 { 3705 struct kvm_segment ss; 3706 unsigned int ss_rpl; 3707 3708 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3709 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3710 3711 if (ss.unusable) 3712 return true; 3713 if (ss.type != 3 && ss.type != 7) 3714 return false; 3715 if (!ss.s) 3716 return false; 3717 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3718 return false; 3719 if (!ss.present) 3720 return false; 3721 3722 return true; 3723 } 3724 3725 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3726 { 3727 struct kvm_segment var; 3728 unsigned int rpl; 3729 3730 vmx_get_segment(vcpu, &var, seg); 3731 rpl = var.selector & SEGMENT_RPL_MASK; 3732 3733 if (var.unusable) 3734 return true; 3735 if (!var.s) 3736 return false; 3737 if (!var.present) 3738 return false; 3739 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 3740 if (var.dpl < rpl) /* DPL < RPL */ 3741 return false; 
3742 } 3743 3744 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3745 * rights flags 3746 */ 3747 return true; 3748 } 3749 3750 static bool tr_valid(struct kvm_vcpu *vcpu) 3751 { 3752 struct kvm_segment tr; 3753 3754 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3755 3756 if (tr.unusable) 3757 return false; 3758 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3759 return false; 3760 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3761 return false; 3762 if (!tr.present) 3763 return false; 3764 3765 return true; 3766 } 3767 3768 static bool ldtr_valid(struct kvm_vcpu *vcpu) 3769 { 3770 struct kvm_segment ldtr; 3771 3772 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3773 3774 if (ldtr.unusable) 3775 return true; 3776 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3777 return false; 3778 if (ldtr.type != 2) 3779 return false; 3780 if (!ldtr.present) 3781 return false; 3782 3783 return true; 3784 } 3785 3786 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3787 { 3788 struct kvm_segment cs, ss; 3789 3790 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3791 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3792 3793 return ((cs.selector & SEGMENT_RPL_MASK) == 3794 (ss.selector & SEGMENT_RPL_MASK)); 3795 } 3796 3797 /* 3798 * Check if guest state is valid. Returns true if valid, false if 3799 * not. 3800 * We assume that registers are always usable 3801 */ 3802 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) 3803 { 3804 /* real mode guest state checks */ 3805 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3806 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3807 return false; 3808 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3809 return false; 3810 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3811 return false; 3812 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3813 return false; 3814 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3815 return false; 3816 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3817 return false; 3818 } else { 3819 /* protected mode guest state checks */ 3820 if (!cs_ss_rpl_check(vcpu)) 3821 return false; 3822 if (!code_segment_valid(vcpu)) 3823 return false; 3824 if (!stack_segment_valid(vcpu)) 3825 return false; 3826 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3827 return false; 3828 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3829 return false; 3830 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3831 return false; 3832 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 3833 return false; 3834 if (!tr_valid(vcpu)) 3835 return false; 3836 if (!ldtr_valid(vcpu)) 3837 return false; 3838 } 3839 /* TODO: 3840 * - Add checks on RIP 3841 * - Add checks on RFLAGS 3842 */ 3843 3844 return true; 3845 } 3846 3847 static int init_rmode_tss(struct kvm *kvm, void __user *ua) 3848 { 3849 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 3850 u16 data; 3851 int i; 3852 3853 for (i = 0; i < 3; i++) { 3854 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) 3855 return -EFAULT; 3856 } 3857 3858 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 3859 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) 3860 return -EFAULT; 3861 3862 data = ~0; 3863 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) 3864 return -EFAULT; 3865 3866 return 0; 3867 } 3868 3869 static int init_rmode_identity_map(struct kvm *kvm) 3870 { 3871 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 3872 int i, r = 0; 3873 void __user *uaddr; 3874 u32 tmp; 3875 3876 /* Protect 
kvm_vmx->ept_identity_pagetable_done. */ 3877 mutex_lock(&kvm->slots_lock); 3878 3879 if (likely(kvm_vmx->ept_identity_pagetable_done)) 3880 goto out; 3881 3882 if (!kvm_vmx->ept_identity_map_addr) 3883 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 3884 3885 uaddr = __x86_set_memory_region(kvm, 3886 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 3887 kvm_vmx->ept_identity_map_addr, 3888 PAGE_SIZE); 3889 if (IS_ERR(uaddr)) { 3890 r = PTR_ERR(uaddr); 3891 goto out; 3892 } 3893 3894 /* Set up identity-mapping pagetable for EPT in real mode */ 3895 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { 3896 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 3897 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 3898 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { 3899 r = -EFAULT; 3900 goto out; 3901 } 3902 } 3903 kvm_vmx->ept_identity_pagetable_done = true; 3904 3905 out: 3906 mutex_unlock(&kvm->slots_lock); 3907 return r; 3908 } 3909 3910 static void seg_setup(int seg) 3911 { 3912 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3913 unsigned int ar; 3914 3915 vmcs_write16(sf->selector, 0); 3916 vmcs_writel(sf->base, 0); 3917 vmcs_write32(sf->limit, 0xffff); 3918 ar = 0x93; 3919 if (seg == VCPU_SREG_CS) 3920 ar |= 0x08; /* code segment */ 3921 3922 vmcs_write32(sf->ar_bytes, ar); 3923 } 3924 3925 int allocate_vpid(void) 3926 { 3927 int vpid; 3928 3929 if (!enable_vpid) 3930 return 0; 3931 spin_lock(&vmx_vpid_lock); 3932 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 3933 if (vpid < VMX_NR_VPIDS) 3934 __set_bit(vpid, vmx_vpid_bitmap); 3935 else 3936 vpid = 0; 3937 spin_unlock(&vmx_vpid_lock); 3938 return vpid; 3939 } 3940 3941 void free_vpid(int vpid) 3942 { 3943 if (!enable_vpid || vpid == 0) 3944 return; 3945 spin_lock(&vmx_vpid_lock); 3946 __clear_bit(vpid, vmx_vpid_bitmap); 3947 spin_unlock(&vmx_vpid_lock); 3948 } 3949 3950 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) 3951 { 3952 /* 3953 * When KVM is a nested hypervisor on top of Hyper-V and uses 3954 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR 3955 * bitmap has changed. 3956 */ 3957 if (kvm_is_using_evmcs()) { 3958 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 3959 3960 if (evmcs->hv_enlightenments_control.msr_bitmap) 3961 evmcs->hv_clean_fields &= 3962 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP; 3963 } 3964 3965 vmx->nested.force_msr_bitmap_recalc = true; 3966 } 3967 3968 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 3969 { 3970 struct vcpu_vmx *vmx = to_vmx(vcpu); 3971 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 3972 3973 if (!cpu_has_vmx_msr_bitmap()) 3974 return; 3975 3976 vmx_msr_bitmap_l01_changed(vmx); 3977 3978 /* 3979 * Mark the desired intercept state in shadow bitmap, this is needed 3980 * for resync when the MSR filters change. 
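 * A cleared shadow bit records that KVM *wants* the MSR passed through;
 * vmx_msr_filter_changed() walks this shadow state to re-apply the
 * disable after userspace updates its MSR filter.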
3981 */ 3982 if (is_valid_passthrough_msr(msr)) { 3983 int idx = possible_passthrough_msr_slot(msr); 3984 3985 if (idx != -ENOENT) { 3986 if (type & MSR_TYPE_R) 3987 clear_bit(idx, vmx->shadow_msr_intercept.read); 3988 if (type & MSR_TYPE_W) 3989 clear_bit(idx, vmx->shadow_msr_intercept.write); 3990 } 3991 } 3992 3993 if ((type & MSR_TYPE_R) && 3994 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { 3995 vmx_set_msr_bitmap_read(msr_bitmap, msr); 3996 type &= ~MSR_TYPE_R; 3997 } 3998 3999 if ((type & MSR_TYPE_W) && 4000 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { 4001 vmx_set_msr_bitmap_write(msr_bitmap, msr); 4002 type &= ~MSR_TYPE_W; 4003 } 4004 4005 if (type & MSR_TYPE_R) 4006 vmx_clear_msr_bitmap_read(msr_bitmap, msr); 4007 4008 if (type & MSR_TYPE_W) 4009 vmx_clear_msr_bitmap_write(msr_bitmap, msr); 4010 } 4011 4012 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 4013 { 4014 struct vcpu_vmx *vmx = to_vmx(vcpu); 4015 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 4016 4017 if (!cpu_has_vmx_msr_bitmap()) 4018 return; 4019 4020 vmx_msr_bitmap_l01_changed(vmx); 4021 4022 /* 4023 * Mark the desired intercept state in shadow bitmap, this is needed 4024 * for resync when the MSR filter changes. 4025 */ 4026 if (is_valid_passthrough_msr(msr)) { 4027 int idx = possible_passthrough_msr_slot(msr); 4028 4029 if (idx != -ENOENT) { 4030 if (type & MSR_TYPE_R) 4031 set_bit(idx, vmx->shadow_msr_intercept.read); 4032 if (type & MSR_TYPE_W) 4033 set_bit(idx, vmx->shadow_msr_intercept.write); 4034 } 4035 } 4036 4037 if (type & MSR_TYPE_R) 4038 vmx_set_msr_bitmap_read(msr_bitmap, msr); 4039 4040 if (type & MSR_TYPE_W) 4041 vmx_set_msr_bitmap_write(msr_bitmap, msr); 4042 } 4043 4044 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) 4045 { 4046 /* 4047 * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves 4048 * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0, 4049 * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits. 4050 */ 4051 const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG; 4052 const int write_idx = read_idx + (0x800 / sizeof(u64)); 4053 struct vcpu_vmx *vmx = to_vmx(vcpu); 4054 u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap; 4055 u8 mode; 4056 4057 if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu))) 4058 return; 4059 4060 if (cpu_has_secondary_exec_ctrls() && 4061 (secondary_exec_controls_get(vmx) & 4062 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { 4063 mode = MSR_BITMAP_MODE_X2APIC; 4064 if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) 4065 mode |= MSR_BITMAP_MODE_X2APIC_APICV; 4066 } else { 4067 mode = 0; 4068 } 4069 4070 if (mode == vmx->x2apic_msr_bitmap_mode) 4071 return; 4072 4073 vmx->x2apic_msr_bitmap_mode = mode; 4074 4075 /* 4076 * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended 4077 * registers (0x840 and above) intercepted, KVM doesn't support them. 4078 * Intercept all writes by default and poke holes as needed. Pass 4079 * through reads for all valid registers by default in x2APIC+APICv 4080 * mode, only the current timer count needs on-demand emulation by KVM. 4081 */ 4082 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) 4083 msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic); 4084 else 4085 msr_bitmap[read_idx] = ~0ull; 4086 msr_bitmap[write_idx] = ~0ull; 4087 4088 /* 4089 * TPR reads and writes can be virtualized even if virtual interrupt 4090 * delivery is not in use. 
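 * I.e. intercept TPR accesses only when x2APIC isn't being virtualized
 * at all.  In APICv mode the code below additionally re-intercepts the
 * timer current count (which KVM must emulate) and pokes holes for EOI,
 * SELF_IPI and, with IPI virtualization enabled, ICR.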
4091 */ 4092 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, 4093 !(mode & MSR_BITMAP_MODE_X2APIC)); 4094 4095 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { 4096 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); 4097 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); 4098 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); 4099 if (enable_ipiv) 4100 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); 4101 } 4102 } 4103
4104 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) 4105 { 4106 struct vcpu_vmx *vmx = to_vmx(vcpu); 4107 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 4108 u32 i; 4109 4110 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); 4111 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); 4112 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); 4113 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); 4114 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { 4115 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); 4116 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); 4117 } 4118 } 4119
4120 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) 4121 { 4122 struct vcpu_vmx *vmx = to_vmx(vcpu); 4123 void *vapic_page; 4124 u32 vppr; 4125 int rvi; 4126 4127 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || 4128 !nested_cpu_has_vid(get_vmcs12(vcpu)) || 4129 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn)) 4130 return false; 4131 4132 rvi = vmx_get_rvi(); 4133 4134 vapic_page = vmx->nested.virtual_apic_map.hva; 4135 vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); 4136 4137 return ((rvi & 0xf0) > (vppr & 0xf0)); 4138 } 4139
4140 static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) 4141 { 4142 struct vcpu_vmx *vmx = to_vmx(vcpu); 4143 u32 i; 4144 4145 /* 4146 * Redo intercept permissions for MSRs that KVM is passing through to 4147 * the guest. Disabling interception will check the new MSR filter and 4148 * ensure that KVM enables interception if userspace wants to filter 4149 * the MSR. MSRs that KVM is already intercepting don't need to be 4150 * refreshed since KVM is going to intercept them regardless of what 4151 * userspace wants. 4152 */ 4153 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 4154 u32 msr = vmx_possible_passthrough_msrs[i]; 4155 4156 if (!test_bit(i, vmx->shadow_msr_intercept.read)) 4157 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); 4158 4159 if (!test_bit(i, vmx->shadow_msr_intercept.write)) 4160 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); 4161 } 4162 4163 /* PT MSRs can be passed through iff PT is exposed to the guest. */ 4164 if (vmx_pt_mode_is_host_guest()) 4165 pt_update_intercept_for_msr(vcpu); 4166 } 4167
4168 static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, 4169 int pi_vec) 4170 { 4171 #ifdef CONFIG_SMP 4172 if (vcpu->mode == IN_GUEST_MODE) { 4173 /* 4174 * The vector of the virtual interrupt has already been set in the PIR. 4175 * Send a notification event to deliver the virtual interrupt 4176 * unless the vCPU is the currently running vCPU, i.e. the 4177 * event is being sent from a fastpath VM-Exit handler, in 4178 * which case the PIR will be synced to the vIRR before 4179 * re-entering the guest.
4180 * 4181 * When the target is not the running vCPU, the following 4182 * possibilities emerge: 4183 * 4184 * Case 1: vCPU stays in non-root mode. Sending a notification 4185 * event posts the interrupt to the vCPU. 4186 * 4187 * Case 2: vCPU exits to root mode and is still runnable. The 4188 * PIR will be synced to the vIRR before re-entering the guest. 4189 * Sending a notification event is ok as the host IRQ handler 4190 * will ignore the spurious event. 4191 * 4192 * Case 3: vCPU exits to root mode and is blocked. vcpu_block() 4193 * has already synced PIR to vIRR and never blocks the vCPU if 4194 * the vIRR is not empty. Therefore, a blocked vCPU here does 4195 * not wait for any requested interrupts in PIR, and sending a 4196 * notification event also results in a benign, spurious event. 4197 */ 4198 4199 if (vcpu != kvm_get_running_vcpu()) 4200 __apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); 4201 return; 4202 } 4203 #endif 4204 /* 4205 * The vCPU isn't in the guest; wake the vCPU in case it is blocking, 4206 * otherwise do nothing as KVM will grab the highest priority pending 4207 * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). 4208 */ 4209 kvm_vcpu_wake_up(vcpu); 4210 } 4211 4212 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4213 int vector) 4214 { 4215 struct vcpu_vmx *vmx = to_vmx(vcpu); 4216 4217 if (is_guest_mode(vcpu) && 4218 vector == vmx->nested.posted_intr_nv) { 4219 /* 4220 * If a posted intr is not recognized by hardware, 4221 * we will accomplish it in the next vmentry. 4222 */ 4223 vmx->nested.pi_pending = true; 4224 kvm_make_request(KVM_REQ_EVENT, vcpu); 4225 4226 /* 4227 * This pairs with the smp_mb_*() after setting vcpu->mode in 4228 * vcpu_enter_guest() to guarantee the vCPU sees the event 4229 * request if triggering a posted interrupt "fails" because 4230 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as 4231 * the smb_wmb() in kvm_make_request() only ensures everything 4232 * done before making the request is visible when the request 4233 * is visible, it doesn't ensure ordering between the store to 4234 * vcpu->requests and the load from vcpu->mode. 4235 */ 4236 smp_mb__after_atomic(); 4237 4238 /* the PIR and ON have been set by L1. */ 4239 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); 4240 return 0; 4241 } 4242 return -1; 4243 } 4244 /* 4245 * Send interrupt to vcpu via posted interrupt way. 4246 * 1. If target vcpu is running(non-root mode), send posted interrupt 4247 * notification to vcpu and hardware will sync PIR to vIRR atomically. 4248 * 2. If target vcpu isn't running(root mode), kick it to pick up the 4249 * interrupt from PIR in next vmentry. 4250 */ 4251 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4252 { 4253 struct vcpu_vmx *vmx = to_vmx(vcpu); 4254 int r; 4255 4256 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4257 if (!r) 4258 return 0; 4259 4260 /* Note, this is called iff the local APIC is in-kernel. */ 4261 if (!vcpu->arch.apic->apicv_active) 4262 return -1; 4263 4264 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4265 return 0; 4266 4267 /* If a previous notification has sent the IPI, nothing to do. 
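 * pi_test_and_set_on() also returns the old value of PID.ON, so a
 * non-zero result means an earlier notification is still outstanding and
 * will pick up the vector that was just set in the PIR.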
*/ 4268 if (pi_test_and_set_on(&vmx->pi_desc)) 4269 return 0; 4270 4271 /* 4272 * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() 4273 * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is 4274 * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a 4275 * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. 4276 */ 4277 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); 4278 return 0; 4279 } 4280 4281 static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 4282 int trig_mode, int vector) 4283 { 4284 struct kvm_vcpu *vcpu = apic->vcpu; 4285 4286 if (vmx_deliver_posted_interrupt(vcpu, vector)) { 4287 kvm_lapic_set_irr(vector, apic); 4288 kvm_make_request(KVM_REQ_EVENT, vcpu); 4289 kvm_vcpu_kick(vcpu); 4290 } else { 4291 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, 4292 trig_mode, vector); 4293 } 4294 } 4295 4296 /* 4297 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4298 * will not change in the lifetime of the guest. 4299 * Note that host-state that does change is set elsewhere. E.g., host-state 4300 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 4301 */ 4302 void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4303 { 4304 u32 low32, high32; 4305 unsigned long tmpl; 4306 unsigned long cr0, cr3, cr4; 4307 4308 cr0 = read_cr0(); 4309 WARN_ON(cr0 & X86_CR0_TS); 4310 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 4311 4312 /* 4313 * Save the most likely value for this task's CR3 in the VMCS. 4314 * We can't use __get_current_cr3_fast() because we're not atomic. 4315 */ 4316 cr3 = __read_cr3(); 4317 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 4318 vmx->loaded_vmcs->host_state.cr3 = cr3; 4319 4320 /* Save the most likely value for this task's CR4 in the VMCS. */ 4321 cr4 = cr4_read_shadow(); 4322 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4323 vmx->loaded_vmcs->host_state.cr4 = cr4; 4324 4325 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4326 #ifdef CONFIG_X86_64 4327 /* 4328 * Load null selectors, so we can avoid reloading them in 4329 * vmx_prepare_switch_to_host(), in case userspace uses 4330 * the null selectors too (the expected case). 4331 */ 4332 vmcs_write16(HOST_DS_SELECTOR, 0); 4333 vmcs_write16(HOST_ES_SELECTOR, 0); 4334 #else 4335 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4336 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4337 #endif 4338 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4339 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4340 4341 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ 4342 4343 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ 4344 4345 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4346 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4347 4348 /* 4349 * SYSENTER is used for 32-bit system calls on either 32-bit or 4350 * 64-bit kernels. It is always zero If neither is allowed, otherwise 4351 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may 4352 * have already done so!). 
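 * Hence HOST_IA32_SYSENTER_ESP is written here, as a constant zero, only
 * when 32-bit system calls are compiled out entirely.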
4353 */ 4354 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) 4355 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); 4356 4357 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); 4358 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4359 4360 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4361 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4362 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4363 } 4364 4365 if (cpu_has_load_ia32_efer()) 4366 vmcs_write64(HOST_IA32_EFER, host_efer); 4367 } 4368 4369 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4370 { 4371 struct kvm_vcpu *vcpu = &vmx->vcpu; 4372 4373 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & 4374 ~vcpu->arch.cr4_guest_rsvd_bits; 4375 if (!enable_ept) { 4376 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; 4377 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; 4378 } 4379 if (is_guest_mode(&vmx->vcpu)) 4380 vcpu->arch.cr4_guest_owned_bits &= 4381 ~get_vmcs12(vcpu)->cr4_guest_host_mask; 4382 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); 4383 } 4384 4385 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4386 { 4387 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4388 4389 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 4390 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4391 4392 if (!enable_vnmi) 4393 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; 4394 4395 if (!enable_preemption_timer) 4396 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 4397 4398 return pin_based_exec_ctrl; 4399 } 4400 4401 static u32 vmx_vmentry_ctrl(void) 4402 { 4403 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; 4404 4405 if (vmx_pt_mode_is_system()) 4406 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | 4407 VM_ENTRY_LOAD_IA32_RTIT_CTL); 4408 /* 4409 * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically. 4410 */ 4411 vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 4412 VM_ENTRY_LOAD_IA32_EFER | 4413 VM_ENTRY_IA32E_MODE); 4414 4415 if (cpu_has_perf_global_ctrl_bug()) 4416 vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 4417 4418 return vmentry_ctrl; 4419 } 4420 4421 static u32 vmx_vmexit_ctrl(void) 4422 { 4423 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; 4424 4425 /* 4426 * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for 4427 * nested virtualization and thus allowed to be set in vmcs12. 
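 * Clearing the "save" controls here only affects what KVM programs into
 * its own VMCSes; whether L1 may set them in vmcs12 is governed by the
 * nested VMX capability MSRs, not by the value computed here.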
4428 */ 4429 vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER | 4430 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER); 4431 4432 if (vmx_pt_mode_is_system()) 4433 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | 4434 VM_EXIT_CLEAR_IA32_RTIT_CTL); 4435 4436 if (cpu_has_perf_global_ctrl_bug()) 4437 vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 4438 4439 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 4440 return vmexit_ctrl & 4441 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); 4442 } 4443 4444 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 4445 { 4446 struct vcpu_vmx *vmx = to_vmx(vcpu); 4447 4448 if (is_guest_mode(vcpu)) { 4449 vmx->nested.update_vmcs01_apicv_status = true; 4450 return; 4451 } 4452 4453 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4454 4455 if (kvm_vcpu_apicv_active(vcpu)) { 4456 secondary_exec_controls_setbit(vmx, 4457 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4458 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4459 if (enable_ipiv) 4460 tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4461 } else { 4462 secondary_exec_controls_clearbit(vmx, 4463 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4464 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4465 if (enable_ipiv) 4466 tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4467 } 4468 4469 vmx_update_msr_bitmap_x2apic(vcpu); 4470 } 4471 4472 static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4473 { 4474 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4475 4476 /* 4477 * Not used by KVM, but fully supported for nesting, i.e. are allowed in 4478 * vmcs12 and propagated to vmcs02 when set in vmcs12. 4479 */ 4480 exec_control &= ~(CPU_BASED_RDTSC_EXITING | 4481 CPU_BASED_USE_IO_BITMAPS | 4482 CPU_BASED_MONITOR_TRAP_FLAG | 4483 CPU_BASED_PAUSE_EXITING); 4484 4485 /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */ 4486 exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING | 4487 CPU_BASED_NMI_WINDOW_EXITING); 4488 4489 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4490 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4491 4492 if (!cpu_need_tpr_shadow(&vmx->vcpu)) 4493 exec_control &= ~CPU_BASED_TPR_SHADOW; 4494 4495 #ifdef CONFIG_X86_64 4496 if (exec_control & CPU_BASED_TPR_SHADOW) 4497 exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING | 4498 CPU_BASED_CR8_STORE_EXITING); 4499 else 4500 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4501 CPU_BASED_CR8_LOAD_EXITING; 4502 #endif 4503 /* No need to intercept CR3 access or INVPLG when using EPT. */ 4504 if (enable_ept) 4505 exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 4506 CPU_BASED_CR3_STORE_EXITING | 4507 CPU_BASED_INVLPG_EXITING); 4508 if (kvm_mwait_in_guest(vmx->vcpu.kvm)) 4509 exec_control &= ~(CPU_BASED_MWAIT_EXITING | 4510 CPU_BASED_MONITOR_EXITING); 4511 if (kvm_hlt_in_guest(vmx->vcpu.kvm)) 4512 exec_control &= ~CPU_BASED_HLT_EXITING; 4513 return exec_control; 4514 } 4515 4516 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) 4517 { 4518 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; 4519 4520 /* 4521 * IPI virtualization relies on APICv. Disable IPI virtualization if 4522 * APICv is inhibited. 4523 */ 4524 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) 4525 exec_control &= ~TERTIARY_EXEC_IPI_VIRT; 4526 4527 return exec_control; 4528 } 4529 4530 /* 4531 * Adjust a single secondary execution control bit to intercept/allow an 4532 * instruction in the guest. 
This is usually done based on whether or not a 4533 * feature has been exposed to the guest in order to correctly emulate faults. 4534 */ 4535 static inline void 4536 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, 4537 u32 control, bool enabled, bool exiting) 4538 { 4539 /* 4540 * If the control is for an opt-in feature, clear the control if the 4541 * feature is not exposed to the guest, i.e. not enabled. If the 4542 * control is opt-out, i.e. an exiting control, clear the control if 4543 * the feature _is_ exposed to the guest, i.e. exiting/interception is 4544 * disabled for the associated instruction. Note, the caller is 4545 * responsible presetting exec_control to set all supported bits. 4546 */ 4547 if (enabled == exiting) 4548 *exec_control &= ~control; 4549 4550 /* 4551 * Update the nested MSR settings so that a nested VMM can/can't set 4552 * controls for features that are/aren't exposed to the guest. 4553 */ 4554 if (nested) { 4555 /* 4556 * All features that can be added or removed to VMX MSRs must 4557 * be supported in the first place for nested virtualization. 4558 */ 4559 if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control))) 4560 enabled = false; 4561 4562 if (enabled) 4563 vmx->nested.msrs.secondary_ctls_high |= control; 4564 else 4565 vmx->nested.msrs.secondary_ctls_high &= ~control; 4566 } 4567 } 4568 4569 /* 4570 * Wrapper macro for the common case of adjusting a secondary execution control 4571 * based on a single guest CPUID bit, with a dedicated feature bit. This also 4572 * verifies that the control is actually supported by KVM and hardware. 4573 */ 4574 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ 4575 ({ \ 4576 struct kvm_vcpu *__vcpu = &(vmx)->vcpu; \ 4577 bool __enabled; \ 4578 \ 4579 if (cpu_has_vmx_##name()) { \ 4580 if (kvm_is_governed_feature(X86_FEATURE_##feat_name)) \ 4581 __enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name); \ 4582 else \ 4583 __enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name); \ 4584 vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\ 4585 __enabled, exiting); \ 4586 } \ 4587 }) 4588 4589 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. 
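 * vmx_adjust_sec_exec_feature() handles opt-in controls of the form
 * SECONDARY_EXEC_ENABLE_<feature>, while vmx_adjust_sec_exec_exiting()
 * handles opt-out controls of the form SECONDARY_EXEC_<feature>_EXITING.
 * E.g. the xsaves/XSAVES invocation below expands to a check of guest
 * XSAVES support against SECONDARY_EXEC_ENABLE_XSAVES.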
*/ 4590 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ 4591 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) 4592 4593 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ 4594 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) 4595
4596 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4597 { 4598 struct kvm_vcpu *vcpu = &vmx->vcpu; 4599 4600 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4601 4602 if (vmx_pt_mode_is_system()) 4603 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); 4604 if (!cpu_need_virtualize_apic_accesses(vcpu)) 4605 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4606 if (vmx->vpid == 0) 4607 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4608 if (!enable_ept) { 4609 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4610 enable_unrestricted_guest = 0; 4611 } 4612 if (!enable_unrestricted_guest) 4613 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4614 if (kvm_pause_in_guest(vmx->vcpu.kvm)) 4615 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4616 if (!kvm_vcpu_apicv_active(vcpu)) 4617 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4618 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4619 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4620
4621 /* 4622 * KVM doesn't support VMFUNC for L1, but the control is set in KVM's 4623 * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2. 4624 */ 4625 exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC; 4626
4627 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, 4628 * in vmx_set_cr4. */ 4629 exec_control &= ~SECONDARY_EXEC_DESC; 4630
4631 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4632 * (handle_vmptrld). 4633 * We can NOT enable shadow_vmcs here because we don't yet have 4634 * a current VMCS12. 4635 */ 4636 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4637
4638 /* 4639 * PML is enabled/disabled when dirty logging of memslots changes, but 4640 * it needs to be set here when dirty logging is already active, e.g. 4641 * if this vCPU was created after dirty logging was enabled. 4642 */ 4643 if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 4644 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4645
4646 vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES); 4647
4648 /* 4649 * RDPID is also gated by ENABLE_RDTSCP; turn on the control if either 4650 * feature is exposed to the guest. This creates a virtualization hole 4651 * if both are supported in hardware but only one is exposed to the 4652 * guest, but letting the guest execute RDTSCP or RDPID when either one 4653 * is advertised is preferable to emulating the advertised instruction 4654 * in KVM on #UD, and obviously better than incorrectly injecting #UD.
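 * The open coded handling below mirrors vmx_adjust_sec_exec_feature(),
 * but keys off of two CPUID feature bits instead of one.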
4655 */ 4656 if (cpu_has_vmx_rdtscp()) { 4657 bool rdpid_or_rdtscp_enabled = 4658 guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || 4659 guest_cpuid_has(vcpu, X86_FEATURE_RDPID); 4660 4661 vmx_adjust_secondary_exec_control(vmx, &exec_control, 4662 SECONDARY_EXEC_ENABLE_RDTSCP, 4663 rdpid_or_rdtscp_enabled, false); 4664 } 4665 4666 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); 4667 4668 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); 4669 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); 4670 4671 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, 4672 ENABLE_USR_WAIT_PAUSE, false); 4673 4674 if (!vcpu->kvm->arch.bus_lock_detection_enabled) 4675 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; 4676 4677 if (!kvm_notify_vmexit_enabled(vcpu->kvm)) 4678 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; 4679 4680 return exec_control; 4681 } 4682 4683 static inline int vmx_get_pid_table_order(struct kvm *kvm) 4684 { 4685 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); 4686 } 4687 4688 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) 4689 { 4690 struct page *pages; 4691 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4692 4693 if (!irqchip_in_kernel(kvm) || !enable_ipiv) 4694 return 0; 4695 4696 if (kvm_vmx->pid_table) 4697 return 0; 4698 4699 pages = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 4700 vmx_get_pid_table_order(kvm)); 4701 if (!pages) 4702 return -ENOMEM; 4703 4704 kvm_vmx->pid_table = (void *)page_address(pages); 4705 return 0; 4706 } 4707 4708 static int vmx_vcpu_precreate(struct kvm *kvm) 4709 { 4710 return vmx_alloc_ipiv_pid_table(kvm); 4711 } 4712 4713 #define VMX_XSS_EXIT_BITMAP 0 4714 4715 static void init_vmcs(struct vcpu_vmx *vmx) 4716 { 4717 struct kvm *kvm = vmx->vcpu.kvm; 4718 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4719 4720 if (nested) 4721 nested_vmx_set_vmcs_shadowing_bitmap(); 4722 4723 if (cpu_has_vmx_msr_bitmap()) 4724 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); 4725 4726 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ 4727 4728 /* Control */ 4729 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4730 4731 exec_controls_set(vmx, vmx_exec_control(vmx)); 4732 4733 if (cpu_has_secondary_exec_ctrls()) 4734 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); 4735 4736 if (cpu_has_tertiary_exec_ctrls()) 4737 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); 4738 4739 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { 4740 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4741 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4742 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4743 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4744 4745 vmcs_write16(GUEST_INTR_STATUS, 0); 4746 4747 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4748 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); 4749 } 4750 4751 if (vmx_can_use_ipiv(&vmx->vcpu)) { 4752 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); 4753 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); 4754 } 4755 4756 if (!kvm_pause_in_guest(kvm)) { 4757 vmcs_write32(PLE_GAP, ple_gap); 4758 vmx->ple_window = ple_window; 4759 vmx->ple_window_dirty = true; 4760 } 4761 4762 if (kvm_notify_vmexit_enabled(kvm)) 4763 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 4764 4765 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4766 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4767 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4768 4769 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4770 
vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4771 vmx_set_constant_host_state(vmx); 4772 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4773 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4774 4775 if (cpu_has_vmx_vmfunc()) 4776 vmcs_write64(VM_FUNCTION_CONTROL, 0); 4777 4778 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4779 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 4780 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 4781 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4782 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 4783 4784 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4785 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4786 4787 vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); 4788 4789 /* 22.2.1, 20.8.1 */ 4790 vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); 4791 4792 vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4793 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); 4794 4795 set_cr4_guest_host_mask(vmx); 4796 4797 if (vmx->vpid != 0) 4798 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4799 4800 if (cpu_has_vmx_xsaves()) 4801 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4802 4803 if (enable_pml) { 4804 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 4805 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 4806 } 4807 4808 vmx_write_encls_bitmap(&vmx->vcpu, NULL); 4809 4810 if (vmx_pt_mode_is_host_guest()) { 4811 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); 4812 /* Bit[6~0] are forced to 1, writes are ignored. */ 4813 vmx->pt_desc.guest.output_mask = 0x7F; 4814 vmcs_write64(GUEST_IA32_RTIT_CTL, 0); 4815 } 4816 4817 vmcs_write32(GUEST_SYSENTER_CS, 0); 4818 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4819 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4820 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4821 4822 if (cpu_has_vmx_tpr_shadow()) { 4823 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4824 if (cpu_need_tpr_shadow(&vmx->vcpu)) 4825 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4826 __pa(vmx->vcpu.arch.apic->regs)); 4827 vmcs_write32(TPR_THRESHOLD, 0); 4828 } 4829 4830 vmx_setup_uret_msrs(vmx); 4831 } 4832 4833 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4834 { 4835 struct vcpu_vmx *vmx = to_vmx(vcpu); 4836 4837 init_vmcs(vmx); 4838 4839 if (nested) 4840 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); 4841 4842 vcpu_setup_sgx_lepubkeyhash(vcpu); 4843 4844 vmx->nested.posted_intr_nv = -1; 4845 vmx->nested.vmxon_ptr = INVALID_GPA; 4846 vmx->nested.current_vmptr = INVALID_GPA; 4847 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 4848 4849 vcpu->arch.microcode_version = 0x100000000ULL; 4850 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; 4851 4852 /* 4853 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR 4854 * or POSTED_INTR_WAKEUP_VECTOR. 
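 * SN (suppress notification) starts out set so that no notification
 * events are sent until the vCPU is first loaded and posted interrupts
 * can actually be delivered to it.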
4855 */ 4856 vmx->pi_desc.nv = POSTED_INTR_VECTOR; 4857 vmx->pi_desc.sn = 1; 4858 } 4859 4860 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 4861 { 4862 struct vcpu_vmx *vmx = to_vmx(vcpu); 4863 4864 if (!init_event) 4865 __vmx_vcpu_reset(vcpu); 4866 4867 vmx->rmode.vm86_active = 0; 4868 vmx->spec_ctrl = 0; 4869 4870 vmx->msr_ia32_umwait_control = 0; 4871 4872 vmx->hv_deadline_tsc = -1; 4873 kvm_set_cr8(vcpu, 0); 4874 4875 vmx_segment_cache_clear(vmx); 4876 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); 4877 4878 seg_setup(VCPU_SREG_CS); 4879 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4880 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); 4881 4882 seg_setup(VCPU_SREG_DS); 4883 seg_setup(VCPU_SREG_ES); 4884 seg_setup(VCPU_SREG_FS); 4885 seg_setup(VCPU_SREG_GS); 4886 seg_setup(VCPU_SREG_SS); 4887 4888 vmcs_write16(GUEST_TR_SELECTOR, 0); 4889 vmcs_writel(GUEST_TR_BASE, 0); 4890 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 4891 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4892 4893 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 4894 vmcs_writel(GUEST_LDTR_BASE, 0); 4895 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4896 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4897 4898 vmcs_writel(GUEST_GDTR_BASE, 0); 4899 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4900 4901 vmcs_writel(GUEST_IDTR_BASE, 0); 4902 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4903 4904 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4905 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4906 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4907 if (kvm_mpx_supported()) 4908 vmcs_write64(GUEST_BNDCFGS, 0); 4909 4910 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4911 4912 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4913 4914 vpid_sync_context(vmx->vpid); 4915 4916 vmx_update_fb_clear_dis(vcpu, vmx); 4917 } 4918 4919 static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) 4920 { 4921 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 4922 } 4923 4924 static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) 4925 { 4926 if (!enable_vnmi || 4927 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 4928 vmx_enable_irq_window(vcpu); 4929 return; 4930 } 4931 4932 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 4933 } 4934 4935 static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 4936 { 4937 struct vcpu_vmx *vmx = to_vmx(vcpu); 4938 uint32_t intr; 4939 int irq = vcpu->arch.interrupt.nr; 4940 4941 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); 4942 4943 ++vcpu->stat.irq_injections; 4944 if (vmx->rmode.vm86_active) { 4945 int inc_eip = 0; 4946 if (vcpu->arch.interrupt.soft) 4947 inc_eip = vcpu->arch.event_exit_inst_len; 4948 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); 4949 return; 4950 } 4951 intr = irq | INTR_INFO_VALID_MASK; 4952 if (vcpu->arch.interrupt.soft) { 4953 intr |= INTR_TYPE_SOFT_INTR; 4954 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 4955 vmx->vcpu.arch.event_exit_inst_len); 4956 } else 4957 intr |= INTR_TYPE_EXT_INTR; 4958 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 4959 4960 vmx_clear_hlt(vcpu); 4961 } 4962 4963 static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 4964 { 4965 struct vcpu_vmx *vmx = to_vmx(vcpu); 4966 4967 if (!enable_vnmi) { 4968 /* 4969 * Tracking the NMI-blocked state in software is built upon 4970 * finding the next open IRQ window. This, in turn, depends on 4971 * well-behaving guests: They have to keep IRQs disabled at 4972 * least as long as the NMI handler runs. 
Otherwise we may 4973 * cause NMI nesting, maybe breaking the guest. But as this is 4974 * highly unlikely, we can live with the residual risk. 4975 */ 4976 vmx->loaded_vmcs->soft_vnmi_blocked = 1; 4977 vmx->loaded_vmcs->vnmi_blocked_time = 0; 4978 } 4979 4980 ++vcpu->stat.nmi_injections; 4981 vmx->loaded_vmcs->nmi_known_unmasked = false; 4982 4983 if (vmx->rmode.vm86_active) { 4984 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); 4985 return; 4986 } 4987 4988 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 4989 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4990 4991 vmx_clear_hlt(vcpu); 4992 } 4993 4994 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 4995 { 4996 struct vcpu_vmx *vmx = to_vmx(vcpu); 4997 bool masked; 4998 4999 if (!enable_vnmi) 5000 return vmx->loaded_vmcs->soft_vnmi_blocked; 5001 if (vmx->loaded_vmcs->nmi_known_unmasked) 5002 return false; 5003 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 5004 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 5005 return masked; 5006 } 5007 5008 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 5009 { 5010 struct vcpu_vmx *vmx = to_vmx(vcpu); 5011 5012 if (!enable_vnmi) { 5013 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { 5014 vmx->loaded_vmcs->soft_vnmi_blocked = masked; 5015 vmx->loaded_vmcs->vnmi_blocked_time = 0; 5016 } 5017 } else { 5018 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 5019 if (masked) 5020 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5021 GUEST_INTR_STATE_NMI); 5022 else 5023 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 5024 GUEST_INTR_STATE_NMI); 5025 } 5026 } 5027 5028 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) 5029 { 5030 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5031 return false; 5032 5033 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) 5034 return true; 5035 5036 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5037 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | 5038 GUEST_INTR_STATE_NMI)); 5039 } 5040 5041 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5042 { 5043 if (to_vmx(vcpu)->nested.nested_run_pending) 5044 return -EBUSY; 5045 5046 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 5047 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 5048 return -EBUSY; 5049 5050 return !vmx_nmi_blocked(vcpu); 5051 } 5052 5053 bool __vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5054 { 5055 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || 5056 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5057 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 5058 } 5059 5060 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 5061 { 5062 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5063 return false; 5064 5065 return __vmx_interrupt_blocked(vcpu); 5066 } 5067 5068 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 5069 { 5070 if (to_vmx(vcpu)->nested.nested_run_pending) 5071 return -EBUSY; 5072 5073 /* 5074 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 5075 * e.g. if the IRQ arrived asynchronously after checking nested events. 
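 * Returning -EBUSY defers the injection; the IRQ will be reevaluated,
 * and the nested VM-Exit synthesized if necessary, on a later pass.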
5076 */ 5077 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 5078 return -EBUSY; 5079 5080 return !vmx_interrupt_blocked(vcpu); 5081 } 5082 5083 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 5084 { 5085 void __user *ret; 5086 5087 if (enable_unrestricted_guest) 5088 return 0; 5089 5090 mutex_lock(&kvm->slots_lock); 5091 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, 5092 PAGE_SIZE * 3); 5093 mutex_unlock(&kvm->slots_lock); 5094 5095 if (IS_ERR(ret)) 5096 return PTR_ERR(ret); 5097 5098 to_kvm_vmx(kvm)->tss_addr = addr; 5099 5100 return init_rmode_tss(kvm, ret); 5101 } 5102 5103 static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) 5104 { 5105 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; 5106 return 0; 5107 } 5108 5109 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 5110 { 5111 switch (vec) { 5112 case BP_VECTOR: 5113 /* 5114 * Update instruction length as we may reinject the exception 5115 * from user space while in guest debugging mode. 5116 */ 5117 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 5118 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5119 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5120 return false; 5121 fallthrough; 5122 case DB_VECTOR: 5123 return !(vcpu->guest_debug & 5124 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); 5125 case DE_VECTOR: 5126 case OF_VECTOR: 5127 case BR_VECTOR: 5128 case UD_VECTOR: 5129 case DF_VECTOR: 5130 case SS_VECTOR: 5131 case GP_VECTOR: 5132 case MF_VECTOR: 5133 return true; 5134 } 5135 return false; 5136 } 5137 5138 static int handle_rmode_exception(struct kvm_vcpu *vcpu, 5139 int vec, u32 err_code) 5140 { 5141 /* 5142 * Instruction with address size override prefix opcode 0x67 5143 * Cause the #SS fault with 0 error code in VM86 mode. 5144 */ 5145 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 5146 if (kvm_emulate_instruction(vcpu, 0)) { 5147 if (vcpu->arch.halt_request) { 5148 vcpu->arch.halt_request = 0; 5149 return kvm_emulate_halt_noskip(vcpu); 5150 } 5151 return 1; 5152 } 5153 return 0; 5154 } 5155 5156 /* 5157 * Forward all other exceptions that are valid in real mode. 5158 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 5159 * the required debugging infrastructure rework. 5160 */ 5161 kvm_queue_exception(vcpu, vec); 5162 return 1; 5163 } 5164 5165 static int handle_machine_check(struct kvm_vcpu *vcpu) 5166 { 5167 /* handled by vmx_vcpu_run() */ 5168 return 1; 5169 } 5170 5171 /* 5172 * If the host has split lock detection disabled, then #AC is 5173 * unconditionally injected into the guest, which is the pre split lock 5174 * detection behaviour. 
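 * Unconditional injection is correct in that case because, without split
 * lock detection, the only source of #AC is the legacy alignment check
 * that the guest itself armed via CR0.AM and EFLAGS.AC at CPL 3.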
5175 * 5176 * If the host has split lock detection enabled then #AC is 5177 * only injected into the guest when: 5178 * - Guest CPL == 3 (user mode) 5179 * - Guest has #AC detection enabled in CR0 5180 * - Guest EFLAGS has AC bit set 5181 */ 5182 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) 5183 { 5184 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) 5185 return true; 5186 5187 return vmx_get_cpl(vcpu) == 3 && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) && 5188 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); 5189 } 5190 5191 static int handle_exception_nmi(struct kvm_vcpu *vcpu) 5192 { 5193 struct vcpu_vmx *vmx = to_vmx(vcpu); 5194 struct kvm_run *kvm_run = vcpu->run; 5195 u32 intr_info, ex_no, error_code; 5196 unsigned long cr2, dr6; 5197 u32 vect_info; 5198 5199 vect_info = vmx->idt_vectoring_info; 5200 intr_info = vmx_get_intr_info(vcpu); 5201 5202 /* 5203 * Machine checks are handled by handle_exception_irqoff(), or by 5204 * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by 5205 * vmx_vcpu_enter_exit(). 5206 */ 5207 if (is_machine_check(intr_info) || is_nmi(intr_info)) 5208 return 1; 5209 5210 /* 5211 * Queue the exception here instead of in handle_nm_fault_irqoff(). 5212 * This ensures the nested_vmx check is not skipped so vmexit can 5213 * be reflected to L1 (when it intercepts #NM) before reaching this 5214 * point. 5215 */ 5216 if (is_nm_fault(intr_info)) { 5217 kvm_queue_exception(vcpu, NM_VECTOR); 5218 return 1; 5219 } 5220 5221 if (is_invalid_opcode(intr_info)) 5222 return handle_ud(vcpu); 5223 5224 error_code = 0; 5225 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5226 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5227 5228 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { 5229 WARN_ON_ONCE(!enable_vmware_backdoor); 5230 5231 /* 5232 * VMware backdoor emulation on #GP interception only handles 5233 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero 5234 * error code on #GP. 5235 */ 5236 if (error_code) { 5237 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 5238 return 1; 5239 } 5240 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); 5241 } 5242 5243 /* 5244 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5245 * MMIO, it is better to report an internal error. 5246 * See the comments in vmx_handle_exit. 5247 */ 5248 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5249 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5250 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5251 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5252 vcpu->run->internal.ndata = 4; 5253 vcpu->run->internal.data[0] = vect_info; 5254 vcpu->run->internal.data[1] = intr_info; 5255 vcpu->run->internal.data[2] = error_code; 5256 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; 5257 return 0; 5258 } 5259 5260 if (is_page_fault(intr_info)) { 5261 cr2 = vmx_get_exit_qual(vcpu); 5262 if (enable_ept && !vcpu->arch.apf.host_apf_flags) { 5263 /* 5264 * EPT will cause page fault only if we need to 5265 * detect illegal GPAs. 
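 * I.e. with EPT enabled a #PF VM-Exit is only expected when KVM is
 * intercepting RSVD page faults to emulate a guest MAXPHYADDR smaller
 * than the host's, hence the WARN below on !allow_smaller_maxphyaddr.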
5266 */ 5267 WARN_ON_ONCE(!allow_smaller_maxphyaddr); 5268 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5269 return 1; 5270 } else 5271 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5272 } 5273 5274 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5275 5276 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5277 return handle_rmode_exception(vcpu, ex_no, error_code); 5278 5279 switch (ex_no) { 5280 case DB_VECTOR: 5281 dr6 = vmx_get_exit_qual(vcpu); 5282 if (!(vcpu->guest_debug & 5283 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5284 /* 5285 * If the #DB was due to ICEBP, a.k.a. INT1, skip the 5286 * instruction. ICEBP generates a trap-like #DB, but 5287 * despite its interception control being tied to #DB, 5288 * is an instruction intercept, i.e. the VM-Exit occurs 5289 * on the ICEBP itself. Use the inner "skip" helper to 5290 * avoid single-step #DB and MTF updates, as ICEBP is 5291 * higher priority. Note, skipping ICEBP still clears 5292 * STI and MOVSS blocking. 5293 * 5294 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS 5295 * if single-step is enabled in RFLAGS and STI or MOVSS 5296 * blocking is active, as the CPU doesn't set the bit 5297 * on VM-Exit due to #DB interception. VM-Entry has a 5298 * consistency check that a single-step #DB is pending 5299 * in this scenario as the previous instruction cannot 5300 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV 5301 * don't modify RFLAGS), therefore the one instruction 5302 * delay when activating single-step breakpoints must 5303 * have already expired. Note, the CPU sets/clears BS 5304 * as appropriate for all other VM-Exits types. 5305 */ 5306 if (is_icebp(intr_info)) 5307 WARN_ON(!skip_emulated_instruction(vcpu)); 5308 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && 5309 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5310 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) 5311 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 5312 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); 5313 5314 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); 5315 return 1; 5316 } 5317 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; 5318 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5319 fallthrough; 5320 case BP_VECTOR: 5321 /* 5322 * Update instruction length as we may reinject #BP from 5323 * user space while in guest debugging mode. Reading it for 5324 * #DB as well causes no harm, it is not used in that case. 5325 */ 5326 vmx->vcpu.arch.event_exit_inst_len = 5327 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5328 kvm_run->exit_reason = KVM_EXIT_DEBUG; 5329 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5330 kvm_run->debug.arch.exception = ex_no; 5331 break; 5332 case AC_VECTOR: 5333 if (vmx_guest_inject_ac(vcpu)) { 5334 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 5335 return 1; 5336 } 5337 5338 /* 5339 * Handle split lock. Depending on detection mode this will 5340 * either warn and disable split lock detection for this 5341 * task or force SIGBUS on it. 
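 * If handle_guest_split_lock() declines to handle the #AC, fall through
 * and report the unexpected exception to userspace.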
5342 */ 5343 if (handle_guest_split_lock(kvm_rip_read(vcpu))) 5344 return 1; 5345 fallthrough; 5346 default: 5347 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 5348 kvm_run->ex.exception = ex_no; 5349 kvm_run->ex.error_code = error_code; 5350 break; 5351 } 5352 return 0; 5353 } 5354 5355 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) 5356 { 5357 ++vcpu->stat.irq_exits; 5358 return 1; 5359 } 5360 5361 static int handle_triple_fault(struct kvm_vcpu *vcpu) 5362 { 5363 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5364 vcpu->mmio_needed = 0; 5365 return 0; 5366 } 5367 5368 static int handle_io(struct kvm_vcpu *vcpu) 5369 { 5370 unsigned long exit_qualification; 5371 int size, in, string; 5372 unsigned port; 5373 5374 exit_qualification = vmx_get_exit_qual(vcpu); 5375 string = (exit_qualification & 16) != 0; 5376 5377 ++vcpu->stat.io_exits; 5378 5379 if (string) 5380 return kvm_emulate_instruction(vcpu, 0); 5381 5382 port = exit_qualification >> 16; 5383 size = (exit_qualification & 7) + 1; 5384 in = (exit_qualification & 8) != 0; 5385 5386 return kvm_fast_pio(vcpu, size, port, in); 5387 } 5388 5389 static void 5390 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5391 { 5392 /* 5393 * Patch in the VMCALL instruction: 5394 */ 5395 hypercall[0] = 0x0f; 5396 hypercall[1] = 0x01; 5397 hypercall[2] = 0xc1; 5398 } 5399 5400 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5401 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5402 { 5403 if (is_guest_mode(vcpu)) { 5404 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5405 unsigned long orig_val = val; 5406 5407 /* 5408 * We get here when L2 changed cr0 in a way that did not change 5409 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5410 * but did change L0 shadowed bits. So we first calculate the 5411 * effective cr0 value that L1 would like to write into the 5412 * hardware. It consists of the L2-owned bits from the new 5413 * value combined with the L1-owned bits from L1's guest_cr0. 5414 */ 5415 val = (val & ~vmcs12->cr0_guest_host_mask) | 5416 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5417 5418 if (kvm_set_cr0(vcpu, val)) 5419 return 1; 5420 vmcs_writel(CR0_READ_SHADOW, orig_val); 5421 return 0; 5422 } else { 5423 return kvm_set_cr0(vcpu, val); 5424 } 5425 } 5426 5427 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5428 { 5429 if (is_guest_mode(vcpu)) { 5430 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5431 unsigned long orig_val = val; 5432 5433 /* analogously to handle_set_cr0 */ 5434 val = (val & ~vmcs12->cr4_guest_host_mask) | 5435 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5436 if (kvm_set_cr4(vcpu, val)) 5437 return 1; 5438 vmcs_writel(CR4_READ_SHADOW, orig_val); 5439 return 0; 5440 } else 5441 return kvm_set_cr4(vcpu, val); 5442 } 5443 5444 static int handle_desc(struct kvm_vcpu *vcpu) 5445 { 5446 /* 5447 * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this 5448 * and other code needs to be updated if UMIP can be guest owned. 
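 * Descriptor-table exiting is enabled only while KVM is emulating UMIP
 * for the guest, i.e. reaching this handler with CR4.UMIP clear
 * indicates a KVM bug (hence the WARN below).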
5449 */ 5450 BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP); 5451 5452 WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP)); 5453 return kvm_emulate_instruction(vcpu, 0); 5454 } 5455 5456 static int handle_cr(struct kvm_vcpu *vcpu) 5457 { 5458 unsigned long exit_qualification, val; 5459 int cr; 5460 int reg; 5461 int err; 5462 int ret; 5463 5464 exit_qualification = vmx_get_exit_qual(vcpu); 5465 cr = exit_qualification & 15; 5466 reg = (exit_qualification >> 8) & 15; 5467 switch ((exit_qualification >> 4) & 3) { 5468 case 0: /* mov to cr */ 5469 val = kvm_register_read(vcpu, reg); 5470 trace_kvm_cr_write(cr, val); 5471 switch (cr) { 5472 case 0: 5473 err = handle_set_cr0(vcpu, val); 5474 return kvm_complete_insn_gp(vcpu, err); 5475 case 3: 5476 WARN_ON_ONCE(enable_unrestricted_guest); 5477 5478 err = kvm_set_cr3(vcpu, val); 5479 return kvm_complete_insn_gp(vcpu, err); 5480 case 4: 5481 err = handle_set_cr4(vcpu, val); 5482 return kvm_complete_insn_gp(vcpu, err); 5483 case 8: { 5484 u8 cr8_prev = kvm_get_cr8(vcpu); 5485 u8 cr8 = (u8)val; 5486 err = kvm_set_cr8(vcpu, cr8); 5487 ret = kvm_complete_insn_gp(vcpu, err); 5488 if (lapic_in_kernel(vcpu)) 5489 return ret; 5490 if (cr8_prev <= cr8) 5491 return ret; 5492 /* 5493 * TODO: we might be squashing a 5494 * KVM_GUESTDBG_SINGLESTEP-triggered 5495 * KVM_EXIT_DEBUG here. 5496 */ 5497 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5498 return 0; 5499 } 5500 } 5501 break; 5502 case 2: /* clts */ 5503 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); 5504 return -EIO; 5505 case 1: /*mov from cr*/ 5506 switch (cr) { 5507 case 3: 5508 WARN_ON_ONCE(enable_unrestricted_guest); 5509 5510 val = kvm_read_cr3(vcpu); 5511 kvm_register_write(vcpu, reg, val); 5512 trace_kvm_cr_read(cr, val); 5513 return kvm_skip_emulated_instruction(vcpu); 5514 case 8: 5515 val = kvm_get_cr8(vcpu); 5516 kvm_register_write(vcpu, reg, val); 5517 trace_kvm_cr_read(cr, val); 5518 return kvm_skip_emulated_instruction(vcpu); 5519 } 5520 break; 5521 case 3: /* lmsw */ 5522 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5523 trace_kvm_cr_write(0, (kvm_read_cr0_bits(vcpu, ~0xful) | val)); 5524 kvm_lmsw(vcpu, val); 5525 5526 return kvm_skip_emulated_instruction(vcpu); 5527 default: 5528 break; 5529 } 5530 vcpu->run->exit_reason = 0; 5531 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 5532 (int)(exit_qualification >> 4) & 3, cr); 5533 return 0; 5534 } 5535 5536 static int handle_dr(struct kvm_vcpu *vcpu) 5537 { 5538 unsigned long exit_qualification; 5539 int dr, dr7, reg; 5540 int err = 1; 5541 5542 exit_qualification = vmx_get_exit_qual(vcpu); 5543 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5544 5545 /* First, if DR does not exist, trigger UD */ 5546 if (!kvm_require_dr(vcpu, dr)) 5547 return 1; 5548 5549 if (vmx_get_cpl(vcpu) > 0) 5550 goto out; 5551 5552 dr7 = vmcs_readl(GUEST_DR7); 5553 if (dr7 & DR7_GD) { 5554 /* 5555 * As the vm-exit takes precedence over the debug trap, we 5556 * need to emulate the latter, either for the host or the 5557 * guest debugging itself. 
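 * I.e. synthesize the #DB the CPU would have delivered: report it to the
 * host if it is debugging the guest with hardware breakpoints, otherwise
 * inject it into the guest with DR6.BD set.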
5558 */ 5559 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5560 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; 5561 vcpu->run->debug.arch.dr7 = dr7; 5562 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5563 vcpu->run->debug.arch.exception = DB_VECTOR; 5564 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5565 return 0; 5566 } else { 5567 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); 5568 return 1; 5569 } 5570 } 5571 5572 if (vcpu->guest_debug == 0) { 5573 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5574 5575 /* 5576 * No more DR vmexits; force a reload of the debug registers 5577 * and reenter on this instruction. The next vmexit will 5578 * retrieve the full state of the debug registers. 5579 */ 5580 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5581 return 1; 5582 } 5583 5584 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5585 if (exit_qualification & TYPE_MOV_FROM_DR) { 5586 unsigned long val; 5587 5588 kvm_get_dr(vcpu, dr, &val); 5589 kvm_register_write(vcpu, reg, val); 5590 err = 0; 5591 } else { 5592 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 5593 } 5594 5595 out: 5596 return kvm_complete_insn_gp(vcpu, err); 5597 } 5598 5599 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5600 { 5601 get_debugreg(vcpu->arch.db[0], 0); 5602 get_debugreg(vcpu->arch.db[1], 1); 5603 get_debugreg(vcpu->arch.db[2], 2); 5604 get_debugreg(vcpu->arch.db[3], 3); 5605 get_debugreg(vcpu->arch.dr6, 6); 5606 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5607 5608 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5609 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5610 5611 /* 5612 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees 5613 * a stale dr6 from the guest. 5614 */ 5615 set_debugreg(DR6_RESERVED, 6); 5616 } 5617 5618 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5619 { 5620 vmcs_writel(GUEST_DR7, val); 5621 } 5622 5623 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5624 { 5625 kvm_apic_update_ppr(vcpu); 5626 return 1; 5627 } 5628 5629 static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5630 { 5631 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 5632 5633 kvm_make_request(KVM_REQ_EVENT, vcpu); 5634 5635 ++vcpu->stat.irq_window_exits; 5636 return 1; 5637 } 5638 5639 static int handle_invlpg(struct kvm_vcpu *vcpu) 5640 { 5641 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5642 5643 kvm_mmu_invlpg(vcpu, exit_qualification); 5644 return kvm_skip_emulated_instruction(vcpu); 5645 } 5646 5647 static int handle_apic_access(struct kvm_vcpu *vcpu) 5648 { 5649 if (likely(fasteoi)) { 5650 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5651 int access_type, offset; 5652 5653 access_type = exit_qualification & APIC_ACCESS_TYPE; 5654 offset = exit_qualification & APIC_ACCESS_OFFSET; 5655 /* 5656 * Sane guest uses MOV to write EOI, with written value 5657 * not cared. So make a short-circuit here by avoiding 5658 * heavy instruction emulation. 
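 * Any other APIC access (or a non-EOI write) falls through to full
 * instruction emulation below.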
5659 */ 5660 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5661 (offset == APIC_EOI)) { 5662 kvm_lapic_set_eoi(vcpu); 5663 return kvm_skip_emulated_instruction(vcpu); 5664 } 5665 } 5666 return kvm_emulate_instruction(vcpu, 0); 5667 } 5668 5669 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5670 { 5671 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5672 int vector = exit_qualification & 0xff; 5673 5674 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5675 kvm_apic_set_eoi_accelerated(vcpu, vector); 5676 return 1; 5677 } 5678 5679 static int handle_apic_write(struct kvm_vcpu *vcpu) 5680 { 5681 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5682 5683 /* 5684 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and 5685 * hardware has done any necessary aliasing, offset adjustments, etc... 5686 * for the access. I.e. the correct value has already been written to 5687 * the vAPIC page for the correct 16-byte chunk. KVM needs only to 5688 * retrieve the register value and emulate the access. 5689 */ 5690 u32 offset = exit_qualification & 0xff0; 5691 5692 kvm_apic_write_nodecode(vcpu, offset); 5693 return 1; 5694 } 5695 5696 static int handle_task_switch(struct kvm_vcpu *vcpu) 5697 { 5698 struct vcpu_vmx *vmx = to_vmx(vcpu); 5699 unsigned long exit_qualification; 5700 bool has_error_code = false; 5701 u32 error_code = 0; 5702 u16 tss_selector; 5703 int reason, type, idt_v, idt_index; 5704 5705 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5706 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5707 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5708 5709 exit_qualification = vmx_get_exit_qual(vcpu); 5710 5711 reason = (u32)exit_qualification >> 30; 5712 if (reason == TASK_SWITCH_GATE && idt_v) { 5713 switch (type) { 5714 case INTR_TYPE_NMI_INTR: 5715 vcpu->arch.nmi_injected = false; 5716 vmx_set_nmi_mask(vcpu, true); 5717 break; 5718 case INTR_TYPE_EXT_INTR: 5719 case INTR_TYPE_SOFT_INTR: 5720 kvm_clear_interrupt_queue(vcpu); 5721 break; 5722 case INTR_TYPE_HARD_EXCEPTION: 5723 if (vmx->idt_vectoring_info & 5724 VECTORING_INFO_DELIVER_CODE_MASK) { 5725 has_error_code = true; 5726 error_code = 5727 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5728 } 5729 fallthrough; 5730 case INTR_TYPE_SOFT_EXCEPTION: 5731 kvm_clear_exception_queue(vcpu); 5732 break; 5733 default: 5734 break; 5735 } 5736 } 5737 tss_selector = exit_qualification; 5738 5739 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5740 type != INTR_TYPE_EXT_INTR && 5741 type != INTR_TYPE_NMI_INTR)) 5742 WARN_ON(!skip_emulated_instruction(vcpu)); 5743 5744 /* 5745 * TODO: What about debug traps on tss switch? 5746 * Are we supposed to inject them and update dr6? 5747 */ 5748 return kvm_task_switch(vcpu, tss_selector, 5749 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, 5750 reason, has_error_code, error_code); 5751 } 5752 5753 static int handle_ept_violation(struct kvm_vcpu *vcpu) 5754 { 5755 unsigned long exit_qualification; 5756 gpa_t gpa; 5757 u64 error_code; 5758 5759 exit_qualification = vmx_get_exit_qual(vcpu); 5760 5761 /* 5762 * EPT violation happened while executing iret from NMI, 5763 * "blocked by NMI" bit has to be set before next VM entry. 5764 * There are errata that may cause this bit to not be set: 5765 * AAK134, BY25. 
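 * Work around the errata by manually re-setting the "blocked by NMI"
 * interruptibility bit below when the exit qualification indicates NMI
 * unblocking due to IRET and no event was being delivered.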
5766 */ 5767 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5768 enable_vnmi && 5769 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5770 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5771 5772 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5773 trace_kvm_page_fault(vcpu, gpa, exit_qualification); 5774 5775 /* Is it a read fault? */ 5776 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) 5777 ? PFERR_USER_MASK : 0; 5778 /* Is it a write fault? */ 5779 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) 5780 ? PFERR_WRITE_MASK : 0; 5781 /* Is it a fetch fault? */ 5782 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) 5783 ? PFERR_FETCH_MASK : 0; 5784 /* ept page table entry is present? */ 5785 error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) 5786 ? PFERR_PRESENT_MASK : 0; 5787 5788 error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? 5789 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; 5790 5791 vcpu->arch.exit_qualification = exit_qualification; 5792 5793 /* 5794 * Check that the GPA doesn't exceed physical memory limits, as that is 5795 * a guest page fault. We have to emulate the instruction here, because 5796 * if the illegal address is that of a paging structure, then 5797 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we 5798 * would also use advanced VM-exit information for EPT violations to 5799 * reconstruct the page fault error code. 5800 */ 5801 if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa))) 5802 return kvm_emulate_instruction(vcpu, 0); 5803 5804 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5805 } 5806 5807 static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5808 { 5809 gpa_t gpa; 5810 5811 if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) 5812 return 1; 5813 5814 /* 5815 * A nested guest cannot optimize MMIO vmexits, because we have an 5816 * nGPA here instead of the required GPA. 
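 * Fast MMIO is therefore attempted only for L1 below; L2 MMIO accesses
 * fall through to the slow path via kvm_mmu_page_fault().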
5817 */ 5818 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5819 if (!is_guest_mode(vcpu) && 5820 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5821 trace_kvm_fast_mmio(gpa); 5822 return kvm_skip_emulated_instruction(vcpu); 5823 } 5824 5825 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); 5826 } 5827 5828 static int handle_nmi_window(struct kvm_vcpu *vcpu) 5829 { 5830 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) 5831 return -EIO; 5832 5833 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 5834 ++vcpu->stat.nmi_window_exits; 5835 kvm_make_request(KVM_REQ_EVENT, vcpu); 5836 5837 return 1; 5838 } 5839 5840 static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) 5841 { 5842 struct vcpu_vmx *vmx = to_vmx(vcpu); 5843 5844 return vmx->emulation_required && !vmx->rmode.vm86_active && 5845 (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected); 5846 } 5847 5848 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5849 { 5850 struct vcpu_vmx *vmx = to_vmx(vcpu); 5851 bool intr_window_requested; 5852 unsigned count = 130; 5853 5854 intr_window_requested = exec_controls_get(vmx) & 5855 CPU_BASED_INTR_WINDOW_EXITING; 5856 5857 while (vmx->emulation_required && count-- != 0) { 5858 if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) 5859 return handle_interrupt_window(&vmx->vcpu); 5860 5861 if (kvm_test_request(KVM_REQ_EVENT, vcpu)) 5862 return 1; 5863 5864 if (!kvm_emulate_instruction(vcpu, 0)) 5865 return 0; 5866 5867 if (vmx_emulation_required_with_pending_exception(vcpu)) { 5868 kvm_prepare_emulation_failure_exit(vcpu); 5869 return 0; 5870 } 5871 5872 if (vcpu->arch.halt_request) { 5873 vcpu->arch.halt_request = 0; 5874 return kvm_emulate_halt_noskip(vcpu); 5875 } 5876 5877 /* 5878 * Note, return 1 and not 0, vcpu_run() will invoke 5879 * xfer_to_guest_mode() which will create a proper return 5880 * code. 5881 */ 5882 if (__xfer_to_guest_mode_work_pending()) 5883 return 1; 5884 } 5885 5886 return 1; 5887 } 5888 5889 static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) 5890 { 5891 if (vmx_emulation_required_with_pending_exception(vcpu)) { 5892 kvm_prepare_emulation_failure_exit(vcpu); 5893 return 0; 5894 } 5895 5896 return 1; 5897 } 5898 5899 static void grow_ple_window(struct kvm_vcpu *vcpu) 5900 { 5901 struct vcpu_vmx *vmx = to_vmx(vcpu); 5902 unsigned int old = vmx->ple_window; 5903 5904 vmx->ple_window = __grow_ple_window(old, ple_window, 5905 ple_window_grow, 5906 ple_window_max); 5907 5908 if (vmx->ple_window != old) { 5909 vmx->ple_window_dirty = true; 5910 trace_kvm_ple_window_update(vcpu->vcpu_id, 5911 vmx->ple_window, old); 5912 } 5913 } 5914 5915 static void shrink_ple_window(struct kvm_vcpu *vcpu) 5916 { 5917 struct vcpu_vmx *vmx = to_vmx(vcpu); 5918 unsigned int old = vmx->ple_window; 5919 5920 vmx->ple_window = __shrink_ple_window(old, ple_window, 5921 ple_window_shrink, 5922 ple_window); 5923 5924 if (vmx->ple_window != old) { 5925 vmx->ple_window_dirty = true; 5926 trace_kvm_ple_window_update(vcpu->vcpu_id, 5927 vmx->ple_window, old); 5928 } 5929 } 5930 5931 /* 5932 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 5933 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 5934 */ 5935 static int handle_pause(struct kvm_vcpu *vcpu) 5936 { 5937 if (!kvm_pause_in_guest(vcpu->kvm)) 5938 grow_ple_window(vcpu); 5939 5940 /* 5941 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" 5942 * VM-execution control is ignored if CPL > 0. 
OTOH, KVM 5943 * never set PAUSE_EXITING and just set PLE if supported, 5944 * so the vcpu must be CPL=0 if it gets a PAUSE exit. 5945 */ 5946 kvm_vcpu_on_spin(vcpu, true); 5947 return kvm_skip_emulated_instruction(vcpu); 5948 } 5949 5950 static int handle_monitor_trap(struct kvm_vcpu *vcpu) 5951 { 5952 return 1; 5953 } 5954 5955 static int handle_invpcid(struct kvm_vcpu *vcpu) 5956 { 5957 u32 vmx_instruction_info; 5958 unsigned long type; 5959 gva_t gva; 5960 struct { 5961 u64 pcid; 5962 u64 gla; 5963 } operand; 5964 int gpr_index; 5965 5966 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { 5967 kvm_queue_exception(vcpu, UD_VECTOR); 5968 return 1; 5969 } 5970 5971 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5972 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5973 type = kvm_register_read(vcpu, gpr_index); 5974 5975 /* According to the Intel instruction reference, the memory operand 5976 * is read even if it isn't needed (e.g., for type==all) 5977 */ 5978 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5979 vmx_instruction_info, false, 5980 sizeof(operand), &gva)) 5981 return 1; 5982 5983 return kvm_handle_invpcid(vcpu, type, gva); 5984 } 5985 5986 static int handle_pml_full(struct kvm_vcpu *vcpu) 5987 { 5988 unsigned long exit_qualification; 5989 5990 trace_kvm_pml_full(vcpu->vcpu_id); 5991 5992 exit_qualification = vmx_get_exit_qual(vcpu); 5993 5994 /* 5995 * PML buffer FULL happened while executing iret from NMI, 5996 * "blocked by NMI" bit has to be set before next VM entry. 5997 */ 5998 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5999 enable_vnmi && 6000 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 6001 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6002 GUEST_INTR_STATE_NMI); 6003 6004 /* 6005 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 6006 * here.., and there's no userspace involvement needed for PML. 6007 */ 6008 return 1; 6009 } 6010 6011 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) 6012 { 6013 struct vcpu_vmx *vmx = to_vmx(vcpu); 6014 6015 if (!vmx->req_immediate_exit && 6016 !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) { 6017 kvm_lapic_expired_hv_timer(vcpu); 6018 return EXIT_FASTPATH_REENTER_GUEST; 6019 } 6020 6021 return EXIT_FASTPATH_NONE; 6022 } 6023 6024 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 6025 { 6026 handle_fastpath_preemption_timer(vcpu); 6027 return 1; 6028 } 6029 6030 /* 6031 * When nested=0, all VMX instruction VM Exits filter here. The handlers 6032 * are overwritten by nested_vmx_setup() when nested=1. 6033 */ 6034 static int handle_vmx_instruction(struct kvm_vcpu *vcpu) 6035 { 6036 kvm_queue_exception(vcpu, UD_VECTOR); 6037 return 1; 6038 } 6039 6040 #ifndef CONFIG_X86_SGX_KVM 6041 static int handle_encls(struct kvm_vcpu *vcpu) 6042 { 6043 /* 6044 * SGX virtualization is disabled. There is no software enable bit for 6045 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent 6046 * the guest from executing ENCLS (when SGX is supported by hardware). 6047 */ 6048 kvm_queue_exception(vcpu, UD_VECTOR); 6049 return 1; 6050 } 6051 #endif /* CONFIG_X86_SGX_KVM */ 6052 6053 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) 6054 { 6055 /* 6056 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK 6057 * VM-Exits. Unconditionally set the flag here and leave the handling to 6058 * vmx_handle_exit(). 
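 * vmx_handle_exit() then forces an exit to userspace with
 * KVM_RUN_X86_BUS_LOCK set in the run flags, using KVM_EXIT_X86_BUS_LOCK
 * as the exit reason when the original exit was otherwise fully handled.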
6059 */ 6060 to_vmx(vcpu)->exit_reason.bus_lock_detected = true; 6061 return 1; 6062 } 6063 6064 static int handle_notify(struct kvm_vcpu *vcpu) 6065 { 6066 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 6067 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; 6068 6069 ++vcpu->stat.notify_window_exits; 6070 6071 /* 6072 * Notify VM exit happened while executing iret from NMI, 6073 * "blocked by NMI" bit has to be set before next VM entry. 6074 */ 6075 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) 6076 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6077 GUEST_INTR_STATE_NMI); 6078 6079 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || 6080 context_invalid) { 6081 vcpu->run->exit_reason = KVM_EXIT_NOTIFY; 6082 vcpu->run->notify.flags = context_invalid ? 6083 KVM_NOTIFY_CONTEXT_INVALID : 0; 6084 return 0; 6085 } 6086 6087 return 1; 6088 } 6089 6090 /* 6091 * The exit handlers return 1 if the exit was handled fully and guest execution 6092 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 6093 * to be done to userspace and return 0. 6094 */ 6095 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 6096 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, 6097 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 6098 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 6099 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 6100 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 6101 [EXIT_REASON_CR_ACCESS] = handle_cr, 6102 [EXIT_REASON_DR_ACCESS] = handle_dr, 6103 [EXIT_REASON_CPUID] = kvm_emulate_cpuid, 6104 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, 6105 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, 6106 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, 6107 [EXIT_REASON_HLT] = kvm_emulate_halt, 6108 [EXIT_REASON_INVD] = kvm_emulate_invd, 6109 [EXIT_REASON_INVLPG] = handle_invlpg, 6110 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, 6111 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, 6112 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, 6113 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, 6114 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, 6115 [EXIT_REASON_VMPTRST] = handle_vmx_instruction, 6116 [EXIT_REASON_VMREAD] = handle_vmx_instruction, 6117 [EXIT_REASON_VMRESUME] = handle_vmx_instruction, 6118 [EXIT_REASON_VMWRITE] = handle_vmx_instruction, 6119 [EXIT_REASON_VMOFF] = handle_vmx_instruction, 6120 [EXIT_REASON_VMON] = handle_vmx_instruction, 6121 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 6122 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 6123 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 6124 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 6125 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, 6126 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, 6127 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 6128 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 6129 [EXIT_REASON_GDTR_IDTR] = handle_desc, 6130 [EXIT_REASON_LDTR_TR] = handle_desc, 6131 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6132 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6133 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6134 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, 6135 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 6136 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, 6137 [EXIT_REASON_INVEPT] = handle_vmx_instruction, 6138 [EXIT_REASON_INVVPID] = handle_vmx_instruction, 6139 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, 6140 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, 6141 
[EXIT_REASON_PML_FULL] = handle_pml_full, 6142 [EXIT_REASON_INVPCID] = handle_invpcid, 6143 [EXIT_REASON_VMFUNC] = handle_vmx_instruction, 6144 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 6145 [EXIT_REASON_ENCLS] = handle_encls, 6146 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, 6147 [EXIT_REASON_NOTIFY] = handle_notify, 6148 }; 6149 6150 static const int kvm_vmx_max_exit_handlers = 6151 ARRAY_SIZE(kvm_vmx_exit_handlers); 6152 6153 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 6154 u64 *info1, u64 *info2, 6155 u32 *intr_info, u32 *error_code) 6156 { 6157 struct vcpu_vmx *vmx = to_vmx(vcpu); 6158 6159 *reason = vmx->exit_reason.full; 6160 *info1 = vmx_get_exit_qual(vcpu); 6161 if (!(vmx->exit_reason.failed_vmentry)) { 6162 *info2 = vmx->idt_vectoring_info; 6163 *intr_info = vmx_get_intr_info(vcpu); 6164 if (is_exception_with_error_code(*intr_info)) 6165 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6166 else 6167 *error_code = 0; 6168 } else { 6169 *info2 = 0; 6170 *intr_info = 0; 6171 *error_code = 0; 6172 } 6173 } 6174 6175 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 6176 { 6177 if (vmx->pml_pg) { 6178 __free_page(vmx->pml_pg); 6179 vmx->pml_pg = NULL; 6180 } 6181 } 6182 6183 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 6184 { 6185 struct vcpu_vmx *vmx = to_vmx(vcpu); 6186 u64 *pml_buf; 6187 u16 pml_idx; 6188 6189 pml_idx = vmcs_read16(GUEST_PML_INDEX); 6190 6191 /* Do nothing if PML buffer is empty */ 6192 if (pml_idx == (PML_ENTITY_NUM - 1)) 6193 return; 6194 6195 /* PML index always points to next available PML buffer entity */ 6196 if (pml_idx >= PML_ENTITY_NUM) 6197 pml_idx = 0; 6198 else 6199 pml_idx++; 6200 6201 pml_buf = page_address(vmx->pml_pg); 6202 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { 6203 u64 gpa; 6204 6205 gpa = pml_buf[pml_idx]; 6206 WARN_ON(gpa & (PAGE_SIZE - 1)); 6207 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 6208 } 6209 6210 /* reset PML index */ 6211 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 6212 } 6213 6214 static void vmx_dump_sel(char *name, uint32_t sel) 6215 { 6216 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 6217 name, vmcs_read16(sel), 6218 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 6219 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 6220 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 6221 } 6222 6223 static void vmx_dump_dtsel(char *name, uint32_t limit) 6224 { 6225 pr_err("%s limit=0x%08x, base=0x%016lx\n", 6226 name, vmcs_read32(limit), 6227 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 6228 } 6229 6230 static void vmx_dump_msrs(char *name, struct vmx_msrs *m) 6231 { 6232 unsigned int i; 6233 struct vmx_msr_entry *e; 6234 6235 pr_err("MSR %s:\n", name); 6236 for (i = 0, e = m->val; i < m->nr; ++i, ++e) 6237 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); 6238 } 6239 6240 void dump_vmcs(struct kvm_vcpu *vcpu) 6241 { 6242 struct vcpu_vmx *vmx = to_vmx(vcpu); 6243 u32 vmentry_ctl, vmexit_ctl; 6244 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; 6245 u64 tertiary_exec_control; 6246 unsigned long cr4; 6247 int efer_slot; 6248 6249 if (!dump_invalid_vmcs) { 6250 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); 6251 return; 6252 } 6253 6254 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); 6255 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); 6256 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 6257 pin_based_exec_ctrl = 
vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); 6258 cr4 = vmcs_readl(GUEST_CR4); 6259 6260 if (cpu_has_secondary_exec_ctrls()) 6261 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6262 else 6263 secondary_exec_control = 0; 6264 6265 if (cpu_has_tertiary_exec_ctrls()) 6266 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); 6267 else 6268 tertiary_exec_control = 0; 6269 6270 pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", 6271 vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); 6272 pr_err("*** Guest State ***\n"); 6273 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6274 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), 6275 vmcs_readl(CR0_GUEST_HOST_MASK)); 6276 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6277 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); 6278 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); 6279 if (cpu_has_vmx_ept()) { 6280 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", 6281 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); 6282 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", 6283 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); 6284 } 6285 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", 6286 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); 6287 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", 6288 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); 6289 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6290 vmcs_readl(GUEST_SYSENTER_ESP), 6291 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); 6292 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); 6293 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); 6294 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); 6295 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); 6296 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); 6297 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); 6298 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); 6299 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); 6300 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); 6301 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); 6302 efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); 6303 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) 6304 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); 6305 else if (efer_slot >= 0) 6306 pr_err("EFER= 0x%016llx (autoload)\n", 6307 vmx->msr_autoload.guest.val[efer_slot].value); 6308 else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) 6309 pr_err("EFER= 0x%016llx (effective)\n", 6310 vcpu->arch.efer | (EFER_LMA | EFER_LME)); 6311 else 6312 pr_err("EFER= 0x%016llx (effective)\n", 6313 vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); 6314 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) 6315 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); 6316 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", 6317 vmcs_read64(GUEST_IA32_DEBUGCTL), 6318 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); 6319 if (cpu_has_load_perf_global_ctrl() && 6320 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 6321 pr_err("PerfGlobCtl = 0x%016llx\n", 6322 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); 6323 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) 6324 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); 6325 pr_err("Interruptibility = %08x ActivityState = %08x\n", 6326 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), 6327 vmcs_read32(GUEST_ACTIVITY_STATE)); 6328 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 6329 pr_err("InterruptStatus = %04x\n", 6330 vmcs_read16(GUEST_INTR_STATUS)); 6331 if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) 6332 vmx_dump_msrs("guest autoload", 
&vmx->msr_autoload.guest); 6333 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) 6334 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); 6335 6336 pr_err("*** Host State ***\n"); 6337 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", 6338 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); 6339 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", 6340 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), 6341 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), 6342 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), 6343 vmcs_read16(HOST_TR_SELECTOR)); 6344 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", 6345 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), 6346 vmcs_readl(HOST_TR_BASE)); 6347 pr_err("GDTBase=%016lx IDTBase=%016lx\n", 6348 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); 6349 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", 6350 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), 6351 vmcs_readl(HOST_CR4)); 6352 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6353 vmcs_readl(HOST_IA32_SYSENTER_ESP), 6354 vmcs_read32(HOST_IA32_SYSENTER_CS), 6355 vmcs_readl(HOST_IA32_SYSENTER_EIP)); 6356 if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) 6357 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); 6358 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) 6359 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); 6360 if (cpu_has_load_perf_global_ctrl() && 6361 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 6362 pr_err("PerfGlobCtl = 0x%016llx\n", 6363 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); 6364 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) 6365 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); 6366 6367 pr_err("*** Control State ***\n"); 6368 pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", 6369 cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); 6370 pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", 6371 pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); 6372 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", 6373 vmcs_read32(EXCEPTION_BITMAP), 6374 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), 6375 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); 6376 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", 6377 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6378 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), 6379 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); 6380 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", 6381 vmcs_read32(VM_EXIT_INTR_INFO), 6382 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 6383 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 6384 pr_err(" reason=%08x qualification=%016lx\n", 6385 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); 6386 pr_err("IDTVectoring: info=%08x errcode=%08x\n", 6387 vmcs_read32(IDT_VECTORING_INFO_FIELD), 6388 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 6389 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); 6390 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) 6391 pr_err("TSC Multiplier = 0x%016llx\n", 6392 vmcs_read64(TSC_MULTIPLIER)); 6393 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { 6394 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 6395 u16 status = vmcs_read16(GUEST_INTR_STATUS); 6396 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); 6397 } 6398 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 6399 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) 6400 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); 6401 pr_cont("virt-APIC addr = 
0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); 6402 } 6403 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 6404 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 6405 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 6406 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); 6407 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) 6408 pr_err("PLE Gap=%08x Window=%08x\n", 6409 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); 6410 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) 6411 pr_err("Virtual processor ID = 0x%04x\n", 6412 vmcs_read16(VIRTUAL_PROCESSOR_ID)); 6413 } 6414 6415 /* 6416 * The guest has exited. See if we can fix it or if we need userspace 6417 * assistance. 6418 */ 6419 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6420 { 6421 struct vcpu_vmx *vmx = to_vmx(vcpu); 6422 union vmx_exit_reason exit_reason = vmx->exit_reason; 6423 u32 vectoring_info = vmx->idt_vectoring_info; 6424 u16 exit_handler_index; 6425 6426 /* 6427 * Flush logged GPAs PML buffer, this will make dirty_bitmap more 6428 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before 6429 * querying dirty_bitmap, we only need to kick all vcpus out of guest 6430 * mode as if vcpus is in root mode, the PML buffer must has been 6431 * flushed already. Note, PML is never enabled in hardware while 6432 * running L2. 6433 */ 6434 if (enable_pml && !is_guest_mode(vcpu)) 6435 vmx_flush_pml_buffer(vcpu); 6436 6437 /* 6438 * KVM should never reach this point with a pending nested VM-Enter. 6439 * More specifically, short-circuiting VM-Entry to emulate L2 due to 6440 * invalid guest state should never happen as that means KVM knowingly 6441 * allowed a nested VM-Enter with an invalid vmcs12. More below. 6442 */ 6443 if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) 6444 return -EIO; 6445 6446 if (is_guest_mode(vcpu)) { 6447 /* 6448 * PML is never enabled when running L2, bail immediately if a 6449 * PML full exit occurs as something is horribly wrong. 6450 */ 6451 if (exit_reason.basic == EXIT_REASON_PML_FULL) 6452 goto unexpected_vmexit; 6453 6454 /* 6455 * The host physical addresses of some pages of guest memory 6456 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC 6457 * Page). The CPU may write to these pages via their host 6458 * physical address while L2 is running, bypassing any 6459 * address-translation-based dirty tracking (e.g. EPT write 6460 * protection). 6461 * 6462 * Mark them dirty on every exit from L2 to prevent them from 6463 * getting out of sync with dirty tracking. 6464 */ 6465 nested_mark_vmcs12_pages_dirty(vcpu); 6466 6467 /* 6468 * Synthesize a triple fault if L2 state is invalid. In normal 6469 * operation, nested VM-Enter rejects any attempt to enter L2 6470 * with invalid state. However, those checks are skipped if 6471 * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If 6472 * L2 state is invalid, it means either L1 modified SMRAM state 6473 * or userspace provided bad state. Synthesize TRIPLE_FAULT as 6474 * doing so is architecturally allowed in the RSM case, and is 6475 * the least awful solution for the userspace case without 6476 * risking false positives. 6477 */ 6478 if (vmx->emulation_required) { 6479 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 6480 return 1; 6481 } 6482 6483 if (nested_vmx_reflect_vmexit(vcpu)) 6484 return 1; 6485 } 6486 6487 /* If guest state is invalid, start emulating. L2 is handled above. 
*/
6488 if (vmx->emulation_required)
6489 return handle_invalid_guest_state(vcpu);
6490
6491 if (exit_reason.failed_vmentry) {
6492 dump_vmcs(vcpu);
6493 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6494 vcpu->run->fail_entry.hardware_entry_failure_reason
6495 = exit_reason.full;
6496 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6497 return 0;
6498 }
6499
6500 if (unlikely(vmx->fail)) {
6501 dump_vmcs(vcpu);
6502 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6503 vcpu->run->fail_entry.hardware_entry_failure_reason
6504 = vmcs_read32(VM_INSTRUCTION_ERROR);
6505 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6506 return 0;
6507 }
6508
6509 /*
6510 * Note:
6511 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
6512 * event delivery, as that indicates the guest is accessing MMIO.
6513 * The VM-Exit would be triggered again after returning to the
6514 * guest, causing an infinite loop.
6515 */
6516 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6517 (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6518 exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6519 exit_reason.basic != EXIT_REASON_PML_FULL &&
6520 exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
6521 exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
6522 exit_reason.basic != EXIT_REASON_NOTIFY)) {
6523 int ndata = 3;
6524
6525 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6526 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6527 vcpu->run->internal.data[0] = vectoring_info;
6528 vcpu->run->internal.data[1] = exit_reason.full;
6529 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
6530 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
6531 vcpu->run->internal.data[ndata++] =
6532 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6533 }
6534 vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
6535 vcpu->run->internal.ndata = ndata;
6536 return 0;
6537 }
6538
6539 if (unlikely(!enable_vnmi &&
6540 vmx->loaded_vmcs->soft_vnmi_blocked)) {
6541 if (!vmx_interrupt_blocked(vcpu)) {
6542 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6543 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6544 vcpu->arch.nmi_pending) {
6545 /*
6546 * This CPU doesn't help us find the end of an
6547 * NMI-blocked window if the guest runs with IRQs
6548 * disabled. So pull the trigger after 1 s of
6549 * futile waiting, but inform the user about it.
6550 */ 6551 printk(KERN_WARNING "%s: Breaking out of NMI-blocked " 6552 "state on VCPU %d after 1 s timeout\n", 6553 __func__, vcpu->vcpu_id); 6554 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 6555 } 6556 } 6557 6558 if (exit_fastpath != EXIT_FASTPATH_NONE) 6559 return 1; 6560 6561 if (exit_reason.basic >= kvm_vmx_max_exit_handlers) 6562 goto unexpected_vmexit; 6563 #ifdef CONFIG_RETPOLINE 6564 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6565 return kvm_emulate_wrmsr(vcpu); 6566 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER) 6567 return handle_preemption_timer(vcpu); 6568 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW) 6569 return handle_interrupt_window(vcpu); 6570 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) 6571 return handle_external_interrupt(vcpu); 6572 else if (exit_reason.basic == EXIT_REASON_HLT) 6573 return kvm_emulate_halt(vcpu); 6574 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) 6575 return handle_ept_misconfig(vcpu); 6576 #endif 6577 6578 exit_handler_index = array_index_nospec((u16)exit_reason.basic, 6579 kvm_vmx_max_exit_handlers); 6580 if (!kvm_vmx_exit_handlers[exit_handler_index]) 6581 goto unexpected_vmexit; 6582 6583 return kvm_vmx_exit_handlers[exit_handler_index](vcpu); 6584 6585 unexpected_vmexit: 6586 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", 6587 exit_reason.full); 6588 dump_vmcs(vcpu); 6589 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 6590 vcpu->run->internal.suberror = 6591 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 6592 vcpu->run->internal.ndata = 2; 6593 vcpu->run->internal.data[0] = exit_reason.full; 6594 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 6595 return 0; 6596 } 6597 6598 static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6599 { 6600 int ret = __vmx_handle_exit(vcpu, exit_fastpath); 6601 6602 /* 6603 * Exit to user space when bus lock detected to inform that there is 6604 * a bus lock in guest. 6605 */ 6606 if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { 6607 if (ret > 0) 6608 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; 6609 6610 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; 6611 return 0; 6612 } 6613 return ret; 6614 } 6615 6616 /* 6617 * Software based L1D cache flush which is used when microcode providing 6618 * the cache control MSR is not loaded. 6619 * 6620 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to 6621 * flush it is required to read in 64 KiB because the replacement algorithm 6622 * is not exactly LRU. This could be sized at runtime via topology 6623 * information but as all relevant affected CPUs have 32KiB L1D cache size 6624 * there is no point in doing so. 6625 */ 6626 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) 6627 { 6628 int size = PAGE_SIZE << L1D_CACHE_ORDER; 6629 6630 /* 6631 * This code is only executed when the flush mode is 'cond' or 6632 * 'always' 6633 */ 6634 if (static_branch_likely(&vmx_l1d_flush_cond)) { 6635 bool flush_l1d; 6636 6637 /* 6638 * Clear the per-vcpu flush bit, it gets set again 6639 * either from vcpu_run() or from one of the unsafe 6640 * VMEXIT handlers. 6641 */ 6642 flush_l1d = vcpu->arch.l1tf_flush_l1d; 6643 vcpu->arch.l1tf_flush_l1d = false; 6644 6645 /* 6646 * Clear the per-cpu flush bit, it gets set again from 6647 * the interrupt handlers. 
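 * If neither the per-vcpu nor the per-cpu bit is set, the flush is
 * skipped entirely for this VM-Enter.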
6648 */ 6649 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); 6650 kvm_clear_cpu_l1tf_flush_l1d(); 6651 6652 if (!flush_l1d) 6653 return; 6654 } 6655 6656 vcpu->stat.l1d_flush++; 6657 6658 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { 6659 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 6660 return; 6661 } 6662 6663 asm volatile( 6664 /* First ensure the pages are in the TLB */ 6665 "xorl %%eax, %%eax\n" 6666 ".Lpopulate_tlb:\n\t" 6667 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6668 "addl $4096, %%eax\n\t" 6669 "cmpl %%eax, %[size]\n\t" 6670 "jne .Lpopulate_tlb\n\t" 6671 "xorl %%eax, %%eax\n\t" 6672 "cpuid\n\t" 6673 /* Now fill the cache */ 6674 "xorl %%eax, %%eax\n" 6675 ".Lfill_cache:\n" 6676 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6677 "addl $64, %%eax\n\t" 6678 "cmpl %%eax, %[size]\n\t" 6679 "jne .Lfill_cache\n\t" 6680 "lfence\n" 6681 :: [flush_pages] "r" (vmx_l1d_flush_pages), 6682 [size] "r" (size) 6683 : "eax", "ebx", "ecx", "edx"); 6684 } 6685 6686 static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 6687 { 6688 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6689 int tpr_threshold; 6690 6691 if (is_guest_mode(vcpu) && 6692 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 6693 return; 6694 6695 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; 6696 if (is_guest_mode(vcpu)) 6697 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; 6698 else 6699 vmcs_write32(TPR_THRESHOLD, tpr_threshold); 6700 } 6701 6702 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) 6703 { 6704 struct vcpu_vmx *vmx = to_vmx(vcpu); 6705 u32 sec_exec_control; 6706 6707 if (!lapic_in_kernel(vcpu)) 6708 return; 6709 6710 if (!flexpriority_enabled && 6711 !cpu_has_vmx_virtualize_x2apic_mode()) 6712 return; 6713 6714 /* Postpone execution until vmcs01 is the current VMCS. */ 6715 if (is_guest_mode(vcpu)) { 6716 vmx->nested.change_vmcs01_virtual_apic_mode = true; 6717 return; 6718 } 6719 6720 sec_exec_control = secondary_exec_controls_get(vmx); 6721 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 6722 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 6723 6724 switch (kvm_get_apic_mode(vcpu)) { 6725 case LAPIC_MODE_INVALID: 6726 WARN_ONCE(true, "Invalid local APIC state"); 6727 break; 6728 case LAPIC_MODE_DISABLED: 6729 break; 6730 case LAPIC_MODE_XAPIC: 6731 if (flexpriority_enabled) { 6732 sec_exec_control |= 6733 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6734 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6735 6736 /* 6737 * Flush the TLB, reloading the APIC access page will 6738 * only do so if its physical address has changed, but 6739 * the guest may have inserted a non-APIC mapping into 6740 * the TLB while the APIC access page was disabled. 6741 */ 6742 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 6743 } 6744 break; 6745 case LAPIC_MODE_X2APIC: 6746 if (cpu_has_vmx_virtualize_x2apic_mode()) 6747 sec_exec_control |= 6748 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 6749 break; 6750 } 6751 secondary_exec_controls_set(vmx, sec_exec_control); 6752 6753 vmx_update_msr_bitmap_x2apic(vcpu); 6754 } 6755 6756 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) 6757 { 6758 const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; 6759 struct kvm *kvm = vcpu->kvm; 6760 struct kvm_memslots *slots = kvm_memslots(kvm); 6761 struct kvm_memory_slot *slot; 6762 unsigned long mmu_seq; 6763 kvm_pfn_t pfn; 6764 6765 /* Defer reload until vmcs01 is the current VMCS. 
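 * The reload_vmcs01_apic_access_page flag set below is acted upon after
 * the next emulated VM-Exit to L1, once vmcs01 is again the current VMCS.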
*/ 6766 if (is_guest_mode(vcpu)) { 6767 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; 6768 return; 6769 } 6770 6771 if (!(secondary_exec_controls_get(to_vmx(vcpu)) & 6772 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 6773 return; 6774 6775 /* 6776 * Grab the memslot so that the hva lookup for the mmu_notifier retry 6777 * is guaranteed to use the same memslot as the pfn lookup, i.e. rely 6778 * on the pfn lookup's validation of the memslot to ensure a valid hva 6779 * is used for the retry check. 6780 */ 6781 slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); 6782 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 6783 return; 6784 6785 /* 6786 * Ensure that the mmu_notifier sequence count is read before KVM 6787 * retrieves the pfn from the primary MMU. Note, the memslot is 6788 * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb() 6789 * in kvm_mmu_invalidate_end(). 6790 */ 6791 mmu_seq = kvm->mmu_invalidate_seq; 6792 smp_rmb(); 6793 6794 /* 6795 * No need to retry if the memslot does not exist or is invalid. KVM 6796 * controls the APIC-access page memslot, and only deletes the memslot 6797 * if APICv is permanently inhibited, i.e. the memslot won't reappear. 6798 */ 6799 pfn = gfn_to_pfn_memslot(slot, gfn); 6800 if (is_error_noslot_pfn(pfn)) 6801 return; 6802 6803 read_lock(&vcpu->kvm->mmu_lock); 6804 if (mmu_invalidate_retry_hva(kvm, mmu_seq, 6805 gfn_to_hva_memslot(slot, gfn))) { 6806 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6807 read_unlock(&vcpu->kvm->mmu_lock); 6808 goto out; 6809 } 6810 6811 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); 6812 read_unlock(&vcpu->kvm->mmu_lock); 6813 6814 /* 6815 * No need for a manual TLB flush at this point, KVM has already done a 6816 * flush if there were SPTEs pointing at the previous page. 6817 */ 6818 out: 6819 /* 6820 * Do not pin apic access page in memory, the MMU notifier 6821 * will call us again if it is migrated or swapped out. 6822 */ 6823 kvm_release_pfn_clean(pfn); 6824 } 6825 6826 static void vmx_hwapic_isr_update(int max_isr) 6827 { 6828 u16 status; 6829 u8 old; 6830 6831 if (max_isr == -1) 6832 max_isr = 0; 6833 6834 status = vmcs_read16(GUEST_INTR_STATUS); 6835 old = status >> 8; 6836 if (max_isr != old) { 6837 status &= 0xff; 6838 status |= max_isr << 8; 6839 vmcs_write16(GUEST_INTR_STATUS, status); 6840 } 6841 } 6842 6843 static void vmx_set_rvi(int vector) 6844 { 6845 u16 status; 6846 u8 old; 6847 6848 if (vector == -1) 6849 vector = 0; 6850 6851 status = vmcs_read16(GUEST_INTR_STATUS); 6852 old = (u8)status & 0xff; 6853 if ((u8)vector != old) { 6854 status &= ~0xff; 6855 status |= (u8)vector; 6856 vmcs_write16(GUEST_INTR_STATUS, status); 6857 } 6858 } 6859 6860 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 6861 { 6862 /* 6863 * When running L2, updating RVI is only relevant when 6864 * vmcs12 virtual-interrupt-delivery enabled. 6865 * However, it can be enabled only when L1 also 6866 * intercepts external-interrupts and in that case 6867 * we should not update vmcs02 RVI but instead intercept 6868 * interrupt. Therefore, do nothing when running L2. 
6869 */ 6870 if (!is_guest_mode(vcpu)) 6871 vmx_set_rvi(max_irr); 6872 } 6873 6874 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 6875 { 6876 struct vcpu_vmx *vmx = to_vmx(vcpu); 6877 int max_irr; 6878 bool got_posted_interrupt; 6879 6880 if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) 6881 return -EIO; 6882 6883 if (pi_test_on(&vmx->pi_desc)) { 6884 pi_clear_on(&vmx->pi_desc); 6885 /* 6886 * IOMMU can write to PID.ON, so the barrier matters even on UP. 6887 * But on x86 this is just a compiler barrier anyway. 6888 */ 6889 smp_mb__after_atomic(); 6890 got_posted_interrupt = 6891 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); 6892 } else { 6893 max_irr = kvm_lapic_find_highest_irr(vcpu); 6894 got_posted_interrupt = false; 6895 } 6896 6897 /* 6898 * Newly recognized interrupts are injected via either virtual interrupt 6899 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is 6900 * disabled in two cases: 6901 * 6902 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 6903 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a 6904 * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected 6905 * into L2, but KVM doesn't use virtual interrupt delivery to inject 6906 * interrupts into L2, and so KVM_REQ_EVENT is again needed. 6907 * 6908 * 2) If APICv is disabled for this vCPU, assigned devices may still 6909 * attempt to post interrupts. The posted interrupt vector will cause 6910 * a VM-Exit and the subsequent entry will call sync_pir_to_irr. 6911 */ 6912 if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) 6913 vmx_set_rvi(max_irr); 6914 else if (got_posted_interrupt) 6915 kvm_make_request(KVM_REQ_EVENT, vcpu); 6916 6917 return max_irr; 6918 } 6919 6920 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 6921 { 6922 if (!kvm_vcpu_apicv_active(vcpu)) 6923 return; 6924 6925 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 6926 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 6927 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 6928 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 6929 } 6930 6931 static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu) 6932 { 6933 struct vcpu_vmx *vmx = to_vmx(vcpu); 6934 6935 pi_clear_on(&vmx->pi_desc); 6936 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); 6937 } 6938 6939 void vmx_do_interrupt_irqoff(unsigned long entry); 6940 void vmx_do_nmi_irqoff(void); 6941 6942 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) 6943 { 6944 /* 6945 * Save xfd_err to guest_fpu before interrupt is enabled, so the 6946 * MSR value is not clobbered by the host activity before the guest 6947 * has chance to consume it. 6948 * 6949 * Do not blindly read xfd_err here, since this exception might 6950 * be caused by L1 interception on a platform which doesn't 6951 * support xfd at all. 6952 * 6953 * Do it conditionally upon guest_fpu::xfd. xfd_err matters 6954 * only when xfd contains a non-zero value. 6955 * 6956 * Queuing exception is done in vmx_handle_exit. See comment there. 
6957 */ 6958 if (vcpu->arch.guest_fpu.fpstate->xfd) 6959 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); 6960 } 6961 6962 static void handle_exception_irqoff(struct vcpu_vmx *vmx) 6963 { 6964 u32 intr_info = vmx_get_intr_info(&vmx->vcpu); 6965 6966 /* if exit due to PF check for async PF */ 6967 if (is_page_fault(intr_info)) 6968 vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); 6969 /* if exit due to NM, handle before interrupts are enabled */ 6970 else if (is_nm_fault(intr_info)) 6971 handle_nm_fault_irqoff(&vmx->vcpu); 6972 /* Handle machine checks before interrupts are enabled */ 6973 else if (is_machine_check(intr_info)) 6974 kvm_machine_check(); 6975 } 6976 6977 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) 6978 { 6979 u32 intr_info = vmx_get_intr_info(vcpu); 6980 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; 6981 gate_desc *desc = (gate_desc *)host_idt_base + vector; 6982 6983 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, 6984 "unexpected VM-Exit interrupt info: 0x%x", intr_info)) 6985 return; 6986 6987 kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); 6988 vmx_do_interrupt_irqoff(gate_offset(desc)); 6989 kvm_after_interrupt(vcpu); 6990 6991 vcpu->arch.at_instruction_boundary = true; 6992 } 6993 6994 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) 6995 { 6996 struct vcpu_vmx *vmx = to_vmx(vcpu); 6997 6998 if (vmx->emulation_required) 6999 return; 7000 7001 if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) 7002 handle_external_interrupt_irqoff(vcpu); 7003 else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) 7004 handle_exception_irqoff(vmx); 7005 } 7006 7007 /* 7008 * The kvm parameter can be NULL (module initialization, or invocation before 7009 * VM creation). Be sure to check the kvm parameter before using it. 7010 */ 7011 static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) 7012 { 7013 switch (index) { 7014 case MSR_IA32_SMBASE: 7015 if (!IS_ENABLED(CONFIG_KVM_SMM)) 7016 return false; 7017 /* 7018 * We cannot do SMM unless we can run the guest in big 7019 * real mode. 7020 */ 7021 return enable_unrestricted_guest || emulate_invalid_guest_state; 7022 case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: 7023 return nested; 7024 case MSR_AMD64_VIRT_SPEC_CTRL: 7025 case MSR_AMD64_TSC_RATIO: 7026 /* This is AMD only. */ 7027 return false; 7028 default: 7029 return true; 7030 } 7031 } 7032 7033 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 7034 { 7035 u32 exit_intr_info; 7036 bool unblock_nmi; 7037 u8 vector; 7038 bool idtv_info_valid; 7039 7040 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7041 7042 if (enable_vnmi) { 7043 if (vmx->loaded_vmcs->nmi_known_unmasked) 7044 return; 7045 7046 exit_intr_info = vmx_get_intr_info(&vmx->vcpu); 7047 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 7048 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 7049 /* 7050 * SDM 3: 27.7.1.2 (September 2008) 7051 * Re-set bit "block by NMI" before VM entry if vmexit caused by 7052 * a guest IRET fault. 7053 * SDM 3: 23.2.2 (September 2008) 7054 * Bit 12 is undefined in any of the following cases: 7055 * If the VM exit sets the valid bit in the IDT-vectoring 7056 * information field. 7057 * If the VM exit is due to a double fault. 
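 * Hence the "blocked by NMI" bit is only re-set below when the exit
 * interruption info is valid, unblocking is reported, the vector is not
 * #DF, and no event was being delivered (IDT-vectoring info invalid).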
7058 */ 7059 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 7060 vector != DF_VECTOR && !idtv_info_valid) 7061 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 7062 GUEST_INTR_STATE_NMI); 7063 else 7064 vmx->loaded_vmcs->nmi_known_unmasked = 7065 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 7066 & GUEST_INTR_STATE_NMI); 7067 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) 7068 vmx->loaded_vmcs->vnmi_blocked_time += 7069 ktime_to_ns(ktime_sub(ktime_get(), 7070 vmx->loaded_vmcs->entry_time)); 7071 } 7072 7073 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 7074 u32 idt_vectoring_info, 7075 int instr_len_field, 7076 int error_code_field) 7077 { 7078 u8 vector; 7079 int type; 7080 bool idtv_info_valid; 7081 7082 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 7083 7084 vcpu->arch.nmi_injected = false; 7085 kvm_clear_exception_queue(vcpu); 7086 kvm_clear_interrupt_queue(vcpu); 7087 7088 if (!idtv_info_valid) 7089 return; 7090 7091 kvm_make_request(KVM_REQ_EVENT, vcpu); 7092 7093 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 7094 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 7095 7096 switch (type) { 7097 case INTR_TYPE_NMI_INTR: 7098 vcpu->arch.nmi_injected = true; 7099 /* 7100 * SDM 3: 27.7.1.2 (September 2008) 7101 * Clear bit "block by NMI" before VM entry if a NMI 7102 * delivery faulted. 7103 */ 7104 vmx_set_nmi_mask(vcpu, false); 7105 break; 7106 case INTR_TYPE_SOFT_EXCEPTION: 7107 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7108 fallthrough; 7109 case INTR_TYPE_HARD_EXCEPTION: 7110 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 7111 u32 err = vmcs_read32(error_code_field); 7112 kvm_requeue_exception_e(vcpu, vector, err); 7113 } else 7114 kvm_requeue_exception(vcpu, vector); 7115 break; 7116 case INTR_TYPE_SOFT_INTR: 7117 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7118 fallthrough; 7119 case INTR_TYPE_EXT_INTR: 7120 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 7121 break; 7122 default: 7123 break; 7124 } 7125 } 7126 7127 static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 7128 { 7129 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 7130 VM_EXIT_INSTRUCTION_LEN, 7131 IDT_VECTORING_ERROR_CODE); 7132 } 7133 7134 static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 7135 { 7136 __vmx_complete_interrupts(vcpu, 7137 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 7138 VM_ENTRY_INSTRUCTION_LEN, 7139 VM_ENTRY_EXCEPTION_ERROR_CODE); 7140 7141 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 7142 } 7143 7144 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 7145 { 7146 int i, nr_msrs; 7147 struct perf_guest_switch_msr *msrs; 7148 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); 7149 7150 pmu->host_cross_mapped_mask = 0; 7151 if (pmu->pebs_enable & pmu->global_ctrl) 7152 intel_pmu_cross_mapped_check(pmu); 7153 7154 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. 
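 * i.e. nr_msrs must only be consulted after the NULL check below.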
*/ 7155 msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); 7156 if (!msrs) 7157 return; 7158 7159 for (i = 0; i < nr_msrs; i++) 7160 if (msrs[i].host == msrs[i].guest) 7161 clear_atomic_switch_msr(vmx, msrs[i].msr); 7162 else 7163 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 7164 msrs[i].host, false); 7165 } 7166 7167 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) 7168 { 7169 struct vcpu_vmx *vmx = to_vmx(vcpu); 7170 u64 tscl; 7171 u32 delta_tsc; 7172 7173 if (vmx->req_immediate_exit) { 7174 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); 7175 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7176 } else if (vmx->hv_deadline_tsc != -1) { 7177 tscl = rdtsc(); 7178 if (vmx->hv_deadline_tsc > tscl) 7179 /* set_hv_timer ensures the delta fits in 32-bits */ 7180 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 7181 cpu_preemption_timer_multi); 7182 else 7183 delta_tsc = 0; 7184 7185 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); 7186 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7187 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { 7188 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); 7189 vmx->loaded_vmcs->hv_timer_soft_disabled = true; 7190 } 7191 } 7192 7193 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) 7194 { 7195 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { 7196 vmx->loaded_vmcs->host_state.rsp = host_rsp; 7197 vmcs_writel(HOST_RSP, host_rsp); 7198 } 7199 } 7200 7201 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, 7202 unsigned int flags) 7203 { 7204 u64 hostval = this_cpu_read(x86_spec_ctrl_current); 7205 7206 if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) 7207 return; 7208 7209 if (flags & VMX_RUN_SAVE_SPEC_CTRL) 7210 vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); 7211 7212 /* 7213 * If the guest/host SPEC_CTRL values differ, restore the host value. 7214 * 7215 * For legacy IBRS, the IBRS bit always needs to be written after 7216 * transitioning from a less privileged predictor mode, regardless of 7217 * whether the guest/host values differ. 7218 */ 7219 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || 7220 vmx->spec_ctrl != hostval) 7221 native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); 7222 7223 barrier_nospec(); 7224 } 7225 7226 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 7227 { 7228 switch (to_vmx(vcpu)->exit_reason.basic) { 7229 case EXIT_REASON_MSR_WRITE: 7230 return handle_fastpath_set_msr_irqoff(vcpu); 7231 case EXIT_REASON_PREEMPTION_TIMER: 7232 return handle_fastpath_preemption_timer(vcpu); 7233 default: 7234 return EXIT_FASTPATH_NONE; 7235 } 7236 } 7237 7238 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, 7239 unsigned int flags) 7240 { 7241 struct vcpu_vmx *vmx = to_vmx(vcpu); 7242 7243 guest_state_enter_irqoff(); 7244 7245 /* 7246 * L1D Flush includes CPU buffer clear to mitigate MDS, but VERW 7247 * mitigation for MDS is done late in VMentry and is still 7248 * executed in spite of L1D Flush. This is because an extra VERW 7249 * should not matter much after the big hammer L1D Flush. 
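 * If no L1D flush is performed, CPU buffers are still cleared via
 * mds_clear_cpu_buffers() for the MMIO Stale Data mitigation when the VM
 * has an assigned device.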
7250 */ 7251 if (static_branch_unlikely(&vmx_l1d_should_flush)) 7252 vmx_l1d_flush(vcpu); 7253 else if (static_branch_unlikely(&mmio_stale_data_clear) && 7254 kvm_arch_has_assigned_device(vcpu->kvm)) 7255 mds_clear_cpu_buffers(); 7256 7257 vmx_disable_fb_clear(vmx); 7258 7259 if (vcpu->arch.cr2 != native_read_cr2()) 7260 native_write_cr2(vcpu->arch.cr2); 7261 7262 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 7263 flags); 7264 7265 vcpu->arch.cr2 = native_read_cr2(); 7266 vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; 7267 7268 vmx->idt_vectoring_info = 0; 7269 7270 vmx_enable_fb_clear(vmx); 7271 7272 if (unlikely(vmx->fail)) { 7273 vmx->exit_reason.full = 0xdead; 7274 goto out; 7275 } 7276 7277 vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); 7278 if (likely(!vmx->exit_reason.failed_vmentry)) 7279 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 7280 7281 if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI && 7282 is_nmi(vmx_get_intr_info(vcpu))) { 7283 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); 7284 vmx_do_nmi_irqoff(); 7285 kvm_after_interrupt(vcpu); 7286 } 7287 7288 out: 7289 guest_state_exit_irqoff(); 7290 } 7291 7292 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) 7293 { 7294 struct vcpu_vmx *vmx = to_vmx(vcpu); 7295 unsigned long cr3, cr4; 7296 7297 /* Record the guest's net vcpu time for enforced NMI injections. */ 7298 if (unlikely(!enable_vnmi && 7299 vmx->loaded_vmcs->soft_vnmi_blocked)) 7300 vmx->loaded_vmcs->entry_time = ktime_get(); 7301 7302 /* 7303 * Don't enter VMX if guest state is invalid, let the exit handler 7304 * start emulation until we arrive back to a valid state. Synthesize a 7305 * consistency check VM-Exit due to invalid guest state and bail. 7306 */ 7307 if (unlikely(vmx->emulation_required)) { 7308 vmx->fail = 0; 7309 7310 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; 7311 vmx->exit_reason.failed_vmentry = 1; 7312 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); 7313 vmx->exit_qualification = ENTRY_FAIL_DEFAULT; 7314 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); 7315 vmx->exit_intr_info = 0; 7316 return EXIT_FASTPATH_NONE; 7317 } 7318 7319 trace_kvm_entry(vcpu); 7320 7321 if (vmx->ple_window_dirty) { 7322 vmx->ple_window_dirty = false; 7323 vmcs_write32(PLE_WINDOW, vmx->ple_window); 7324 } 7325 7326 /* 7327 * We did this in prepare_switch_to_guest, because it needs to 7328 * be within srcu_read_lock. 7329 */ 7330 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync); 7331 7332 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) 7333 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 7334 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) 7335 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 7336 vcpu->arch.regs_dirty = 0; 7337 7338 /* 7339 * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately 7340 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time 7341 * it switches back to the current->mm, which can occur in KVM context 7342 * when switching to a temporary mm to patch kernel code, e.g. if KVM 7343 * toggles a static key while handling a VM-Exit. 
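	 * The compares below against the cached host_state.cr3/cr4 exist
	 * only to skip the VMWRITEs when nothing changed; HOST_CR3 and
	 * HOST_CR4 are rewritten solely when the current values differ from
	 * what the VMCS already holds.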
7344 */ 7345 cr3 = __get_current_cr3_fast(); 7346 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 7347 vmcs_writel(HOST_CR3, cr3); 7348 vmx->loaded_vmcs->host_state.cr3 = cr3; 7349 } 7350 7351 cr4 = cr4_read_shadow(); 7352 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 7353 vmcs_writel(HOST_CR4, cr4); 7354 vmx->loaded_vmcs->host_state.cr4 = cr4; 7355 } 7356 7357 /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ 7358 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) 7359 set_debugreg(vcpu->arch.dr6, 6); 7360 7361 /* When single-stepping over STI and MOV SS, we must clear the 7362 * corresponding interruptibility bits in the guest state. Otherwise 7363 * vmentry fails as it then expects bit 14 (BS) in pending debug 7364 * exceptions being set, but that's not correct for the guest debugging 7365 * case. */ 7366 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 7367 vmx_set_interrupt_shadow(vcpu, 0); 7368 7369 kvm_load_guest_xsave_state(vcpu); 7370 7371 pt_guest_enter(vmx); 7372 7373 atomic_switch_perf_msrs(vmx); 7374 if (intel_pmu_lbr_is_enabled(vcpu)) 7375 vmx_passthrough_lbr_msrs(vcpu); 7376 7377 if (enable_preemption_timer) 7378 vmx_update_hv_timer(vcpu); 7379 7380 kvm_wait_lapic_expire(vcpu); 7381 7382 /* The actual VMENTER/EXIT is in the .noinstr.text section. */ 7383 vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx)); 7384 7385 /* All fields are clean at this point */ 7386 if (kvm_is_using_evmcs()) { 7387 current_evmcs->hv_clean_fields |= 7388 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 7389 7390 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu); 7391 } 7392 7393 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ 7394 if (vmx->host_debugctlmsr) 7395 update_debugctlmsr(vmx->host_debugctlmsr); 7396 7397 #ifndef CONFIG_X86_64 7398 /* 7399 * The sysexit path does not restore ds/es, so we must set them to 7400 * a reasonable value ourselves. 7401 * 7402 * We can't defer this to vmx_prepare_switch_to_host() since that 7403 * function may be executed in interrupt context, which saves and 7404 * restore segments around it, nullifying its effect. 7405 */ 7406 loadsegment(ds, __USER_DS); 7407 loadsegment(es, __USER_DS); 7408 #endif 7409 7410 pt_guest_exit(vmx); 7411 7412 kvm_load_host_xsave_state(vcpu); 7413 7414 if (is_guest_mode(vcpu)) { 7415 /* 7416 * Track VMLAUNCH/VMRESUME that have made past guest state 7417 * checking. 
7418 */ 7419 if (vmx->nested.nested_run_pending && 7420 !vmx->exit_reason.failed_vmentry) 7421 ++vcpu->stat.nested_run; 7422 7423 vmx->nested.nested_run_pending = 0; 7424 } 7425 7426 if (unlikely(vmx->fail)) 7427 return EXIT_FASTPATH_NONE; 7428 7429 if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) 7430 kvm_machine_check(); 7431 7432 trace_kvm_exit(vcpu, KVM_ISA_VMX); 7433 7434 if (unlikely(vmx->exit_reason.failed_vmentry)) 7435 return EXIT_FASTPATH_NONE; 7436 7437 vmx->loaded_vmcs->launched = 1; 7438 7439 vmx_recover_nmi_blocking(vmx); 7440 vmx_complete_interrupts(vmx); 7441 7442 if (is_guest_mode(vcpu)) 7443 return EXIT_FASTPATH_NONE; 7444 7445 return vmx_exit_handlers_fastpath(vcpu); 7446 } 7447 7448 static void vmx_vcpu_free(struct kvm_vcpu *vcpu) 7449 { 7450 struct vcpu_vmx *vmx = to_vmx(vcpu); 7451 7452 if (enable_pml) 7453 vmx_destroy_pml_buffer(vmx); 7454 free_vpid(vmx->vpid); 7455 nested_vmx_free_vcpu(vcpu); 7456 free_loaded_vmcs(vmx->loaded_vmcs); 7457 } 7458 7459 static int vmx_vcpu_create(struct kvm_vcpu *vcpu) 7460 { 7461 struct vmx_uret_msr *tsx_ctrl; 7462 struct vcpu_vmx *vmx; 7463 int i, err; 7464 7465 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); 7466 vmx = to_vmx(vcpu); 7467 7468 INIT_LIST_HEAD(&vmx->pi_wakeup_list); 7469 7470 err = -ENOMEM; 7471 7472 vmx->vpid = allocate_vpid(); 7473 7474 /* 7475 * If PML is turned on, failure on enabling PML just results in failure 7476 * of creating the vcpu, therefore we can simplify PML logic (by 7477 * avoiding dealing with cases, such as enabling PML partially on vcpus 7478 * for the guest), etc. 7479 */ 7480 if (enable_pml) { 7481 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 7482 if (!vmx->pml_pg) 7483 goto free_vpid; 7484 } 7485 7486 for (i = 0; i < kvm_nr_uret_msrs; ++i) 7487 vmx->guest_uret_msrs[i].mask = -1ull; 7488 if (boot_cpu_has(X86_FEATURE_RTM)) { 7489 /* 7490 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. 7491 * Keep the host value unchanged to avoid changing CPUID bits 7492 * under the host kernel's feet. 7493 */ 7494 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 7495 if (tsx_ctrl) 7496 tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR; 7497 } 7498 7499 err = alloc_loaded_vmcs(&vmx->vmcs01); 7500 if (err < 0) 7501 goto free_pml; 7502 7503 /* 7504 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a 7505 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the 7506 * feature only for vmcs01, KVM currently isn't equipped to realize any 7507 * performance benefits from enabling it for vmcs02. 
7508 */ 7509 if (kvm_is_using_evmcs() && 7510 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { 7511 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 7512 7513 evmcs->hv_enlightenments_control.msr_bitmap = 1; 7514 } 7515 7516 /* The MSR bitmap starts with all ones */ 7517 bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7518 bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7519 7520 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); 7521 #ifdef CONFIG_X86_64 7522 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 7523 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 7524 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 7525 #endif 7526 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 7527 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 7528 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 7529 if (kvm_cstate_in_guest(vcpu->kvm)) { 7530 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); 7531 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); 7532 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); 7533 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); 7534 } 7535 7536 vmx->loaded_vmcs = &vmx->vmcs01; 7537 7538 if (cpu_need_virtualize_apic_accesses(vcpu)) { 7539 err = kvm_alloc_apic_access_page(vcpu->kvm); 7540 if (err) 7541 goto free_vmcs; 7542 } 7543 7544 if (enable_ept && !enable_unrestricted_guest) { 7545 err = init_rmode_identity_map(vcpu->kvm); 7546 if (err) 7547 goto free_vmcs; 7548 } 7549 7550 if (vmx_can_use_ipiv(vcpu)) 7551 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], 7552 __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); 7553 7554 return 0; 7555 7556 free_vmcs: 7557 free_loaded_vmcs(vmx->loaded_vmcs); 7558 free_pml: 7559 vmx_destroy_pml_buffer(vmx); 7560 free_vpid: 7561 free_vpid(vmx->vpid); 7562 return err; 7563 } 7564 7565 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7566 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7567 7568 static int vmx_vm_init(struct kvm *kvm) 7569 { 7570 if (!ple_gap) 7571 kvm->arch.pause_in_guest = true; 7572 7573 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { 7574 switch (l1tf_mitigation) { 7575 case L1TF_MITIGATION_OFF: 7576 case L1TF_MITIGATION_FLUSH_NOWARN: 7577 /* 'I explicitly don't care' is set */ 7578 break; 7579 case L1TF_MITIGATION_FLUSH: 7580 case L1TF_MITIGATION_FLUSH_NOSMT: 7581 case L1TF_MITIGATION_FULL: 7582 /* 7583 * Warn upon starting the first VM in a potentially 7584 * insecure environment. 7585 */ 7586 if (sched_smt_active()) 7587 pr_warn_once(L1TF_MSG_SMT); 7588 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 7589 pr_warn_once(L1TF_MSG_L1D); 7590 break; 7591 case L1TF_MITIGATION_FULL_FORCE: 7592 /* Flush is enforced */ 7593 break; 7594 } 7595 } 7596 return 0; 7597 } 7598 7599 static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 7600 { 7601 u8 cache; 7602 7603 /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in 7604 * memory aliases with conflicting memory types and sometimes MCEs. 
	 * We have to be careful as to what is honored and when.
	 *
	 * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
	 * UC. The effective memory type is UC or WC depending on guest PAT.
	 * This was historically the source of MCEs and we want to be
	 * conservative.
	 *
	 * When there is no need to deal with noncoherent DMA (e.g., no VT-d
	 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
	 * EPT memory type is set to WB. The effective memory type is forced
	 * WB.
	 *
	 * Otherwise, we trust the guest. Guest CD/MTRR/PAT are all honored.
	 * The EPT memory type is used to emulate guest CD/MTRR.
	 */

	if (is_mmio)
		return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;

	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
		return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;

	if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
			cache = MTRR_TYPE_WRBACK;
		else
			cache = MTRR_TYPE_UNCACHABLE;

		return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
	}

	return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
}

static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
{
	/*
	 * These bits in the secondary execution controls field
	 * are dynamic, the others are mostly based on the hypervisor
	 * architecture and the guest's CPUID. Do not touch the
	 * dynamic bits.
	 */
	u32 mask =
		SECONDARY_EXEC_SHADOW_VMCS |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
		SECONDARY_EXEC_DESC;

	u32 cur_ctl = secondary_exec_controls_get(vmx);

	secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
}

/*
 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
 * (indicating "allowed-1") if they are supported in the guest's CPUID.
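 * For example, the LA57 update below expands (roughly) to:
 *
 *	if (entry && (entry->ecx & feature_bit(LA57)))
 *		vmx->nested.msrs.cr4_fixed1 |= X86_CR4_LA57;
 *
 * i.e. the nested hypervisor is only allowed to run with CR4.LA57 set
 * when 5-level paging is enumerated in this vCPU's CPUID.7.0:ECX.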
7661 */ 7662 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) 7663 { 7664 struct vcpu_vmx *vmx = to_vmx(vcpu); 7665 struct kvm_cpuid_entry2 *entry; 7666 7667 vmx->nested.msrs.cr0_fixed1 = 0xffffffff; 7668 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; 7669 7670 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ 7671 if (entry && (entry->_reg & (_cpuid_mask))) \ 7672 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ 7673 } while (0) 7674 7675 entry = kvm_find_cpuid_entry(vcpu, 0x1); 7676 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); 7677 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); 7678 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); 7679 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); 7680 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); 7681 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); 7682 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); 7683 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); 7684 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); 7685 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); 7686 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); 7687 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); 7688 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); 7689 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); 7690 7691 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); 7692 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); 7693 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); 7694 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); 7695 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); 7696 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); 7697 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); 7698 7699 #undef cr4_fixed1_update 7700 } 7701 7702 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) 7703 { 7704 struct vcpu_vmx *vmx = to_vmx(vcpu); 7705 struct kvm_cpuid_entry2 *best = NULL; 7706 int i; 7707 7708 for (i = 0; i < PT_CPUID_LEAVES; i++) { 7709 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); 7710 if (!best) 7711 return; 7712 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; 7713 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; 7714 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; 7715 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; 7716 } 7717 7718 /* Get the number of configurable Address Ranges for filtering */ 7719 vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps, 7720 PT_CAP_num_address_ranges); 7721 7722 /* Initialize and clear the no dependency bits */ 7723 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | 7724 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC | 7725 RTIT_CTL_BRANCH_EN); 7726 7727 /* 7728 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise 7729 * will inject an #GP 7730 */ 7731 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) 7732 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; 7733 7734 /* 7735 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and 7736 * PSBFreq can be set 7737 */ 7738 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) 7739 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | 7740 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); 7741 7742 /* 7743 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set 7744 */ 7745 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) 7746 vmx->pt_desc.ctl_bitmask &= 
~(RTIT_CTL_MTC_EN | 7747 RTIT_CTL_MTC_RANGE); 7748 7749 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ 7750 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) 7751 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | 7752 RTIT_CTL_PTW_EN); 7753 7754 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ 7755 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) 7756 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; 7757 7758 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ 7759 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) 7760 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; 7761 7762 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ 7763 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) 7764 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; 7765 7766 /* unmask address range configure area */ 7767 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) 7768 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); 7769 } 7770 7771 static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) 7772 { 7773 struct vcpu_vmx *vmx = to_vmx(vcpu); 7774 7775 /* 7776 * XSAVES is effectively enabled if and only if XSAVE is also exposed 7777 * to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be 7778 * set if and only if XSAVE is supported. 7779 */ 7780 if (boot_cpu_has(X86_FEATURE_XSAVE) && 7781 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) 7782 kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES); 7783 7784 kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX); 7785 7786 vmx_setup_uret_msrs(vmx); 7787 7788 if (cpu_has_secondary_exec_ctrls()) 7789 vmcs_set_secondary_exec_control(vmx, 7790 vmx_secondary_exec_control(vmx)); 7791 7792 if (guest_can_use(vcpu, X86_FEATURE_VMX)) 7793 vmx->msr_ia32_feature_control_valid_bits |= 7794 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7795 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 7796 else 7797 vmx->msr_ia32_feature_control_valid_bits &= 7798 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7799 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); 7800 7801 if (guest_can_use(vcpu, X86_FEATURE_VMX)) 7802 nested_vmx_cr_fixed1_bits_update(vcpu); 7803 7804 if (boot_cpu_has(X86_FEATURE_INTEL_PT) && 7805 guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) 7806 update_intel_pt_cfg(vcpu); 7807 7808 if (boot_cpu_has(X86_FEATURE_RTM)) { 7809 struct vmx_uret_msr *msr; 7810 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 7811 if (msr) { 7812 bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); 7813 vmx_set_guest_uret_msr(vmx, msr, enabled ? 
0 : TSX_CTRL_RTM_DISABLE); 7814 } 7815 } 7816 7817 if (kvm_cpu_cap_has(X86_FEATURE_XFD)) 7818 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, 7819 !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); 7820 7821 if (boot_cpu_has(X86_FEATURE_IBPB)) 7822 vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W, 7823 !guest_has_pred_cmd_msr(vcpu)); 7824 7825 if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) 7826 vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, 7827 !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)); 7828 7829 set_cr4_guest_host_mask(vmx); 7830 7831 vmx_write_encls_bitmap(vcpu, NULL); 7832 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) 7833 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; 7834 else 7835 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; 7836 7837 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) 7838 vmx->msr_ia32_feature_control_valid_bits |= 7839 FEAT_CTL_SGX_LC_ENABLED; 7840 else 7841 vmx->msr_ia32_feature_control_valid_bits &= 7842 ~FEAT_CTL_SGX_LC_ENABLED; 7843 7844 /* Refresh #PF interception to account for MAXPHYADDR changes. */ 7845 vmx_update_exception_bitmap(vcpu); 7846 } 7847 7848 static u64 vmx_get_perf_capabilities(void) 7849 { 7850 u64 perf_cap = PMU_CAP_FW_WRITES; 7851 struct x86_pmu_lbr lbr; 7852 u64 host_perf_cap = 0; 7853 7854 if (!enable_pmu) 7855 return 0; 7856 7857 if (boot_cpu_has(X86_FEATURE_PDCM)) 7858 rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); 7859 7860 if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { 7861 x86_perf_get_lbr(&lbr); 7862 if (lbr.nr) 7863 perf_cap |= host_perf_cap & PMU_CAP_LBR_FMT; 7864 } 7865 7866 if (vmx_pebs_supported()) { 7867 perf_cap |= host_perf_cap & PERF_CAP_PEBS_MASK; 7868 7869 /* 7870 * Disallow adaptive PEBS as it is functionally broken, can be 7871 * used by the guest to read *host* LBRs, and can be used to 7872 * bypass userspace event filters. To correctly and safely 7873 * support adaptive PEBS, KVM needs to: 7874 * 7875 * 1. Account for the ADAPTIVE flag when (re)programming fixed 7876 * counters. 7877 * 7878 * 2. Gain support from perf (or take direct control of counter 7879 * programming) to support events without adaptive PEBS 7880 * enabled for the hardware counter. 7881 * 7882 * 3. Ensure LBR MSRs cannot hold host data on VM-Entry with 7883 * adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1. 7884 * 7885 * 4. Document which PMU events are effectively exposed to the 7886 * guest via adaptive PEBS, and make adaptive PEBS mutually 7887 * exclusive with KVM_SET_PMU_EVENT_FILTER if necessary. 
7888 */ 7889 perf_cap &= ~PERF_CAP_PEBS_BASELINE; 7890 } 7891 7892 return perf_cap; 7893 } 7894 7895 static __init void vmx_set_cpu_caps(void) 7896 { 7897 kvm_set_cpu_caps(); 7898 7899 /* CPUID 0x1 */ 7900 if (nested) 7901 kvm_cpu_cap_set(X86_FEATURE_VMX); 7902 7903 /* CPUID 0x7 */ 7904 if (kvm_mpx_supported()) 7905 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); 7906 if (!cpu_has_vmx_invpcid()) 7907 kvm_cpu_cap_clear(X86_FEATURE_INVPCID); 7908 if (vmx_pt_mode_is_host_guest()) 7909 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); 7910 if (vmx_pebs_supported()) { 7911 kvm_cpu_cap_check_and_set(X86_FEATURE_DS); 7912 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); 7913 } 7914 7915 if (!enable_pmu) 7916 kvm_cpu_cap_clear(X86_FEATURE_PDCM); 7917 kvm_caps.supported_perf_cap = vmx_get_perf_capabilities(); 7918 7919 if (!enable_sgx) { 7920 kvm_cpu_cap_clear(X86_FEATURE_SGX); 7921 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); 7922 kvm_cpu_cap_clear(X86_FEATURE_SGX1); 7923 kvm_cpu_cap_clear(X86_FEATURE_SGX2); 7924 } 7925 7926 if (vmx_umip_emulated()) 7927 kvm_cpu_cap_set(X86_FEATURE_UMIP); 7928 7929 /* CPUID 0xD.1 */ 7930 kvm_caps.supported_xss = 0; 7931 if (!cpu_has_vmx_xsaves()) 7932 kvm_cpu_cap_clear(X86_FEATURE_XSAVES); 7933 7934 /* CPUID 0x80000001 and 0x7 (RDPID) */ 7935 if (!cpu_has_vmx_rdtscp()) { 7936 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); 7937 kvm_cpu_cap_clear(X86_FEATURE_RDPID); 7938 } 7939 7940 if (cpu_has_vmx_waitpkg()) 7941 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); 7942 } 7943 7944 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) 7945 { 7946 to_vmx(vcpu)->req_immediate_exit = true; 7947 } 7948 7949 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, 7950 struct x86_instruction_info *info) 7951 { 7952 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7953 unsigned short port; 7954 bool intercept; 7955 int size; 7956 7957 if (info->intercept == x86_intercept_in || 7958 info->intercept == x86_intercept_ins) { 7959 port = info->src_val; 7960 size = info->dst_bytes; 7961 } else { 7962 port = info->dst_val; 7963 size = info->src_bytes; 7964 } 7965 7966 /* 7967 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction 7968 * VM-exits depend on the 'unconditional IO exiting' VM-execution 7969 * control. 7970 * 7971 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. 7972 */ 7973 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 7974 intercept = nested_cpu_has(vmcs12, 7975 CPU_BASED_UNCOND_IO_EXITING); 7976 else 7977 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); 7978 7979 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ 7980 return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; 7981 } 7982 7983 static int vmx_check_intercept(struct kvm_vcpu *vcpu, 7984 struct x86_instruction_info *info, 7985 enum x86_intercept_stage stage, 7986 struct x86_exception *exception) 7987 { 7988 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7989 7990 switch (info->intercept) { 7991 /* 7992 * RDPID causes #UD if disabled through secondary execution controls. 7993 * Because it is marked as EmulateOnUD, we need to intercept it here. 7994 * Note, RDPID is hidden behind ENABLE_RDTSCP. 
7995 */ 7996 case x86_intercept_rdpid: 7997 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) { 7998 exception->vector = UD_VECTOR; 7999 exception->error_code_valid = false; 8000 return X86EMUL_PROPAGATE_FAULT; 8001 } 8002 break; 8003 8004 case x86_intercept_in: 8005 case x86_intercept_ins: 8006 case x86_intercept_out: 8007 case x86_intercept_outs: 8008 return vmx_check_intercept_io(vcpu, info); 8009 8010 case x86_intercept_lgdt: 8011 case x86_intercept_lidt: 8012 case x86_intercept_lldt: 8013 case x86_intercept_ltr: 8014 case x86_intercept_sgdt: 8015 case x86_intercept_sidt: 8016 case x86_intercept_sldt: 8017 case x86_intercept_str: 8018 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC)) 8019 return X86EMUL_CONTINUE; 8020 8021 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ 8022 break; 8023 8024 case x86_intercept_pause: 8025 /* 8026 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides 8027 * with vanilla NOPs in the emulator. Apply the interception 8028 * check only to actual PAUSE instructions. Don't check 8029 * PAUSE-loop-exiting, software can't expect a given PAUSE to 8030 * exit, i.e. KVM is within its rights to allow L2 to execute 8031 * the PAUSE. 8032 */ 8033 if ((info->rep_prefix != REPE_PREFIX) || 8034 !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING)) 8035 return X86EMUL_CONTINUE; 8036 8037 break; 8038 8039 /* TODO: check more intercepts... */ 8040 default: 8041 break; 8042 } 8043 8044 return X86EMUL_UNHANDLEABLE; 8045 } 8046 8047 #ifdef CONFIG_X86_64 8048 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ 8049 static inline int u64_shl_div_u64(u64 a, unsigned int shift, 8050 u64 divisor, u64 *result) 8051 { 8052 u64 low = a << shift, high = a >> (64 - shift); 8053 8054 /* To avoid the overflow on divq */ 8055 if (high >= divisor) 8056 return 1; 8057 8058 /* Low hold the result, high hold rem which is discarded */ 8059 asm("divq %2\n\t" : "=a" (low), "=d" (high) : 8060 "rm" (divisor), "0" (low), "1" (high)); 8061 *result = low; 8062 8063 return 0; 8064 } 8065 8066 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, 8067 bool *expired) 8068 { 8069 struct vcpu_vmx *vmx; 8070 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; 8071 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; 8072 8073 vmx = to_vmx(vcpu); 8074 tscl = rdtsc(); 8075 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); 8076 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; 8077 lapic_timer_advance_cycles = nsec_to_cycles(vcpu, 8078 ktimer->timer_advance_ns); 8079 8080 if (delta_tsc > lapic_timer_advance_cycles) 8081 delta_tsc -= lapic_timer_advance_cycles; 8082 else 8083 delta_tsc = 0; 8084 8085 /* Convert to host delta tsc if tsc scaling is enabled */ 8086 if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio && 8087 delta_tsc && u64_shl_div_u64(delta_tsc, 8088 kvm_caps.tsc_scaling_ratio_frac_bits, 8089 vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc)) 8090 return -ERANGE; 8091 8092 /* 8093 * If the delta tsc can't fit in the 32 bit after the multi shift, 8094 * we can't use the preemption timer. 8095 * It's possible that it fits on later vmentries, but checking 8096 * on every vmentry is costly so we just use an hrtimer. 
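	 * For example, assuming a TSC-to-preemption-timer rate of 2^5
	 * (cpu_preemption_timer_multi == 5), any deadline more than 2^37 TSC
	 * cycles away (roughly 45 seconds on a 3 GHz TSC) trips the check
	 * below and falls back to the hrtimer.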
8097 */ 8098 if (delta_tsc >> (cpu_preemption_timer_multi + 32)) 8099 return -ERANGE; 8100 8101 vmx->hv_deadline_tsc = tscl + delta_tsc; 8102 *expired = !delta_tsc; 8103 return 0; 8104 } 8105 8106 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) 8107 { 8108 to_vmx(vcpu)->hv_deadline_tsc = -1; 8109 } 8110 #endif 8111 8112 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) 8113 { 8114 if (!kvm_pause_in_guest(vcpu->kvm)) 8115 shrink_ple_window(vcpu); 8116 } 8117 8118 void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu) 8119 { 8120 struct vcpu_vmx *vmx = to_vmx(vcpu); 8121 8122 if (WARN_ON_ONCE(!enable_pml)) 8123 return; 8124 8125 if (is_guest_mode(vcpu)) { 8126 vmx->nested.update_vmcs01_cpu_dirty_logging = true; 8127 return; 8128 } 8129 8130 /* 8131 * Note, nr_memslots_dirty_logging can be changed concurrent with this 8132 * code, but in that case another update request will be made and so 8133 * the guest will never run with a stale PML value. 8134 */ 8135 if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging)) 8136 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8137 else 8138 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML); 8139 } 8140 8141 static void vmx_setup_mce(struct kvm_vcpu *vcpu) 8142 { 8143 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 8144 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 8145 FEAT_CTL_LMCE_ENABLED; 8146 else 8147 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 8148 ~FEAT_CTL_LMCE_ENABLED; 8149 } 8150 8151 #ifdef CONFIG_KVM_SMM 8152 static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 8153 { 8154 /* we need a nested vmexit to enter SMM, postpone if run is pending */ 8155 if (to_vmx(vcpu)->nested.nested_run_pending) 8156 return -EBUSY; 8157 return !is_smm(vcpu); 8158 } 8159 8160 static int vmx_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) 8161 { 8162 struct vcpu_vmx *vmx = to_vmx(vcpu); 8163 8164 /* 8165 * TODO: Implement custom flows for forcing the vCPU out/in of L2 on 8166 * SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong 8167 * SMI and RSM only modify state that is saved and restored via SMRAM. 8168 * E.g. most MSRs are left untouched, but many are modified by VM-Exit 8169 * and VM-Enter, and thus L2's values may be corrupted on SMI+RSM. 8170 */ 8171 vmx->nested.smm.guest_mode = is_guest_mode(vcpu); 8172 if (vmx->nested.smm.guest_mode) 8173 nested_vmx_vmexit(vcpu, -1, 0, 0); 8174 8175 vmx->nested.smm.vmxon = vmx->nested.vmxon; 8176 vmx->nested.vmxon = false; 8177 vmx_clear_hlt(vcpu); 8178 return 0; 8179 } 8180 8181 static int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) 8182 { 8183 struct vcpu_vmx *vmx = to_vmx(vcpu); 8184 int ret; 8185 8186 if (vmx->nested.smm.vmxon) { 8187 vmx->nested.vmxon = true; 8188 vmx->nested.smm.vmxon = false; 8189 } 8190 8191 if (vmx->nested.smm.guest_mode) { 8192 ret = nested_vmx_enter_non_root_mode(vcpu, false); 8193 if (ret) 8194 return ret; 8195 8196 vmx->nested.nested_run_pending = 1; 8197 vmx->nested.smm.guest_mode = false; 8198 } 8199 return 0; 8200 } 8201 8202 static void vmx_enable_smi_window(struct kvm_vcpu *vcpu) 8203 { 8204 /* RSM will cause a vmexit anyway. 
*/ 8205 } 8206 #endif 8207 8208 static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 8209 { 8210 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); 8211 } 8212 8213 static void vmx_migrate_timers(struct kvm_vcpu *vcpu) 8214 { 8215 if (is_guest_mode(vcpu)) { 8216 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; 8217 8218 if (hrtimer_try_to_cancel(timer) == 1) 8219 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 8220 } 8221 } 8222 8223 static void vmx_hardware_unsetup(void) 8224 { 8225 kvm_set_posted_intr_wakeup_handler(NULL); 8226 8227 if (nested) 8228 nested_vmx_hardware_unsetup(); 8229 8230 free_kvm_area(); 8231 } 8232 8233 #define VMX_REQUIRED_APICV_INHIBITS \ 8234 ( \ 8235 BIT(APICV_INHIBIT_REASON_DISABLE)| \ 8236 BIT(APICV_INHIBIT_REASON_ABSENT) | \ 8237 BIT(APICV_INHIBIT_REASON_HYPERV) | \ 8238 BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \ 8239 BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \ 8240 BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \ 8241 BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \ 8242 ) 8243 8244 static void vmx_vm_destroy(struct kvm *kvm) 8245 { 8246 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 8247 8248 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); 8249 } 8250 8251 static struct kvm_x86_ops vmx_x86_ops __initdata = { 8252 .name = KBUILD_MODNAME, 8253 8254 .check_processor_compatibility = vmx_check_processor_compat, 8255 8256 .hardware_unsetup = vmx_hardware_unsetup, 8257 8258 .hardware_enable = vmx_hardware_enable, 8259 .hardware_disable = vmx_hardware_disable, 8260 .has_emulated_msr = vmx_has_emulated_msr, 8261 8262 .vm_size = sizeof(struct kvm_vmx), 8263 .vm_init = vmx_vm_init, 8264 .vm_destroy = vmx_vm_destroy, 8265 8266 .vcpu_precreate = vmx_vcpu_precreate, 8267 .vcpu_create = vmx_vcpu_create, 8268 .vcpu_free = vmx_vcpu_free, 8269 .vcpu_reset = vmx_vcpu_reset, 8270 8271 .prepare_switch_to_guest = vmx_prepare_switch_to_guest, 8272 .vcpu_load = vmx_vcpu_load, 8273 .vcpu_put = vmx_vcpu_put, 8274 8275 .update_exception_bitmap = vmx_update_exception_bitmap, 8276 .get_msr_feature = vmx_get_msr_feature, 8277 .get_msr = vmx_get_msr, 8278 .set_msr = vmx_set_msr, 8279 .get_segment_base = vmx_get_segment_base, 8280 .get_segment = vmx_get_segment, 8281 .set_segment = vmx_set_segment, 8282 .get_cpl = vmx_get_cpl, 8283 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 8284 .is_valid_cr0 = vmx_is_valid_cr0, 8285 .set_cr0 = vmx_set_cr0, 8286 .is_valid_cr4 = vmx_is_valid_cr4, 8287 .set_cr4 = vmx_set_cr4, 8288 .set_efer = vmx_set_efer, 8289 .get_idt = vmx_get_idt, 8290 .set_idt = vmx_set_idt, 8291 .get_gdt = vmx_get_gdt, 8292 .set_gdt = vmx_set_gdt, 8293 .set_dr7 = vmx_set_dr7, 8294 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, 8295 .cache_reg = vmx_cache_reg, 8296 .get_rflags = vmx_get_rflags, 8297 .set_rflags = vmx_set_rflags, 8298 .get_if_flag = vmx_get_if_flag, 8299 8300 .flush_tlb_all = vmx_flush_tlb_all, 8301 .flush_tlb_current = vmx_flush_tlb_current, 8302 .flush_tlb_gva = vmx_flush_tlb_gva, 8303 .flush_tlb_guest = vmx_flush_tlb_guest, 8304 8305 .vcpu_pre_run = vmx_vcpu_pre_run, 8306 .vcpu_run = vmx_vcpu_run, 8307 .handle_exit = vmx_handle_exit, 8308 .skip_emulated_instruction = vmx_skip_emulated_instruction, 8309 .update_emulated_instruction = vmx_update_emulated_instruction, 8310 .set_interrupt_shadow = vmx_set_interrupt_shadow, 8311 .get_interrupt_shadow = vmx_get_interrupt_shadow, 8312 .patch_hypercall = vmx_patch_hypercall, 8313 .inject_irq = vmx_inject_irq, 8314 .inject_nmi = vmx_inject_nmi, 8315 
.inject_exception = vmx_inject_exception, 8316 .cancel_injection = vmx_cancel_injection, 8317 .interrupt_allowed = vmx_interrupt_allowed, 8318 .nmi_allowed = vmx_nmi_allowed, 8319 .get_nmi_mask = vmx_get_nmi_mask, 8320 .set_nmi_mask = vmx_set_nmi_mask, 8321 .enable_nmi_window = vmx_enable_nmi_window, 8322 .enable_irq_window = vmx_enable_irq_window, 8323 .update_cr8_intercept = vmx_update_cr8_intercept, 8324 .set_virtual_apic_mode = vmx_set_virtual_apic_mode, 8325 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 8326 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, 8327 .load_eoi_exitmap = vmx_load_eoi_exitmap, 8328 .apicv_pre_state_restore = vmx_apicv_pre_state_restore, 8329 .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS, 8330 .hwapic_irr_update = vmx_hwapic_irr_update, 8331 .hwapic_isr_update = vmx_hwapic_isr_update, 8332 .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, 8333 .sync_pir_to_irr = vmx_sync_pir_to_irr, 8334 .deliver_interrupt = vmx_deliver_interrupt, 8335 .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, 8336 8337 .set_tss_addr = vmx_set_tss_addr, 8338 .set_identity_map_addr = vmx_set_identity_map_addr, 8339 .get_mt_mask = vmx_get_mt_mask, 8340 8341 .get_exit_info = vmx_get_exit_info, 8342 8343 .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, 8344 8345 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 8346 8347 .get_l2_tsc_offset = vmx_get_l2_tsc_offset, 8348 .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, 8349 .write_tsc_offset = vmx_write_tsc_offset, 8350 .write_tsc_multiplier = vmx_write_tsc_multiplier, 8351 8352 .load_mmu_pgd = vmx_load_mmu_pgd, 8353 8354 .check_intercept = vmx_check_intercept, 8355 .handle_exit_irqoff = vmx_handle_exit_irqoff, 8356 8357 .request_immediate_exit = vmx_request_immediate_exit, 8358 8359 .sched_in = vmx_sched_in, 8360 8361 .cpu_dirty_log_size = PML_ENTITY_NUM, 8362 .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, 8363 8364 .nested_ops = &vmx_nested_ops, 8365 8366 .pi_update_irte = vmx_pi_update_irte, 8367 .pi_start_assignment = vmx_pi_start_assignment, 8368 8369 #ifdef CONFIG_X86_64 8370 .set_hv_timer = vmx_set_hv_timer, 8371 .cancel_hv_timer = vmx_cancel_hv_timer, 8372 #endif 8373 8374 .setup_mce = vmx_setup_mce, 8375 8376 #ifdef CONFIG_KVM_SMM 8377 .smi_allowed = vmx_smi_allowed, 8378 .enter_smm = vmx_enter_smm, 8379 .leave_smm = vmx_leave_smm, 8380 .enable_smi_window = vmx_enable_smi_window, 8381 #endif 8382 8383 .can_emulate_instruction = vmx_can_emulate_instruction, 8384 .apic_init_signal_blocked = vmx_apic_init_signal_blocked, 8385 .migrate_timers = vmx_migrate_timers, 8386 8387 .msr_filter_changed = vmx_msr_filter_changed, 8388 .complete_emulated_msr = kvm_complete_insn_gp, 8389 8390 .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, 8391 }; 8392 8393 static unsigned int vmx_handle_intel_pt_intr(void) 8394 { 8395 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 8396 8397 /* '0' on failure so that the !PT case can use a RET0 static call. 
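	 * A non-zero return tells the host PMI handler that this ToPA PMI
	 * was taken on behalf of the guest: the overflow is latched into the
	 * guest's GLOBAL_STATUS and delivered via KVM_REQ_PMI rather than
	 * being handled by the host Intel PT driver.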
*/ 8398 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) 8399 return 0; 8400 8401 kvm_make_request(KVM_REQ_PMI, vcpu); 8402 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, 8403 (unsigned long *)&vcpu->arch.pmu.global_status); 8404 return 1; 8405 } 8406 8407 static __init void vmx_setup_user_return_msrs(void) 8408 { 8409 8410 /* 8411 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 8412 * will emulate SYSCALL in legacy mode if the vendor string in guest 8413 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To 8414 * support this emulation, MSR_STAR is included in the list for i386, 8415 * but is never loaded into hardware. MSR_CSTAR is also never loaded 8416 * into hardware and is here purely for emulation purposes. 8417 */ 8418 const u32 vmx_uret_msrs_list[] = { 8419 #ifdef CONFIG_X86_64 8420 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 8421 #endif 8422 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 8423 MSR_IA32_TSX_CTRL, 8424 }; 8425 int i; 8426 8427 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); 8428 8429 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) 8430 kvm_add_user_return_msr(vmx_uret_msrs_list[i]); 8431 } 8432 8433 static void __init vmx_setup_me_spte_mask(void) 8434 { 8435 u64 me_mask = 0; 8436 8437 /* 8438 * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use 8439 * the former to avoid exposing shadow_phys_bits. 8440 * 8441 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to 8442 * shadow_phys_bits. On MKTME and/or TDX capable systems, 8443 * boot_cpu_data.x86_phys_bits holds the actual physical address 8444 * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR 8445 * reported by CPUID. Those bits between are KeyID bits. 8446 */ 8447 if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits()) 8448 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, 8449 kvm_get_shadow_phys_bits() - 1); 8450 /* 8451 * Unlike SME, host kernel doesn't support setting up any 8452 * MKTME KeyID on Intel platforms. No memory encryption 8453 * bits should be included into the SPTE. 8454 */ 8455 kvm_mmu_set_me_spte_mask(0, me_mask); 8456 } 8457 8458 static struct kvm_x86_init_ops vmx_init_ops __initdata; 8459 8460 static __init int hardware_setup(void) 8461 { 8462 unsigned long host_bndcfgs; 8463 struct desc_ptr dt; 8464 int r; 8465 8466 store_idt(&dt); 8467 host_idt_base = dt.address; 8468 8469 vmx_setup_user_return_msrs(); 8470 8471 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 8472 return -EIO; 8473 8474 if (cpu_has_perf_global_ctrl_bug()) 8475 pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 8476 "does not work properly. Using workaround\n"); 8477 8478 if (boot_cpu_has(X86_FEATURE_NX)) 8479 kvm_enable_efer_bits(EFER_NX); 8480 8481 if (boot_cpu_has(X86_FEATURE_MPX)) { 8482 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); 8483 WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost"); 8484 } 8485 8486 if (!cpu_has_vmx_mpx()) 8487 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 8488 XFEATURE_MASK_BNDCSR); 8489 8490 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || 8491 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) 8492 enable_vpid = 0; 8493 8494 if (!cpu_has_vmx_ept() || 8495 !cpu_has_vmx_ept_4levels() || 8496 !cpu_has_vmx_ept_mt_wb() || 8497 !cpu_has_vmx_invept_global()) 8498 enable_ept = 0; 8499 8500 /* NX support is required for shadow paging. 
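	 * With EPT, execute permission comes from the EPT entries themselves,
	 * so host NX support only becomes mandatory when KVM has to build
	 * shadow page tables, which rely on the NX/XD bit to make mappings
	 * non-executable.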
*/ 8501 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { 8502 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 8503 return -EOPNOTSUPP; 8504 } 8505 8506 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 8507 enable_ept_ad_bits = 0; 8508 8509 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) 8510 enable_unrestricted_guest = 0; 8511 8512 if (!cpu_has_vmx_flexpriority()) 8513 flexpriority_enabled = 0; 8514 8515 if (!cpu_has_virtual_nmis()) 8516 enable_vnmi = 0; 8517 8518 #ifdef CONFIG_X86_SGX_KVM 8519 if (!cpu_has_vmx_encls_vmexit()) 8520 enable_sgx = false; 8521 #endif 8522 8523 /* 8524 * set_apic_access_page_addr() is used to reload apic access 8525 * page upon invalidation. No need to do anything if not 8526 * using the APIC_ACCESS_ADDR VMCS field. 8527 */ 8528 if (!flexpriority_enabled) 8529 vmx_x86_ops.set_apic_access_page_addr = NULL; 8530 8531 if (!cpu_has_vmx_tpr_shadow()) 8532 vmx_x86_ops.update_cr8_intercept = NULL; 8533 8534 #if IS_ENABLED(CONFIG_HYPERV) 8535 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH 8536 && enable_ept) { 8537 vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs; 8538 vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range; 8539 } 8540 #endif 8541 8542 if (!cpu_has_vmx_ple()) { 8543 ple_gap = 0; 8544 ple_window = 0; 8545 ple_window_grow = 0; 8546 ple_window_max = 0; 8547 ple_window_shrink = 0; 8548 } 8549 8550 if (!cpu_has_vmx_apicv()) 8551 enable_apicv = 0; 8552 if (!enable_apicv) 8553 vmx_x86_ops.sync_pir_to_irr = NULL; 8554 8555 if (!enable_apicv || !cpu_has_vmx_ipiv()) 8556 enable_ipiv = false; 8557 8558 if (cpu_has_vmx_tsc_scaling()) 8559 kvm_caps.has_tsc_control = true; 8560 8561 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; 8562 kvm_caps.tsc_scaling_ratio_frac_bits = 48; 8563 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); 8564 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); 8565 8566 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 8567 8568 if (enable_ept) 8569 kvm_mmu_set_ept_masks(enable_ept_ad_bits, 8570 cpu_has_vmx_ept_execute_only()); 8571 8572 /* 8573 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID 8574 * bits to shadow_zero_check. 8575 */ 8576 vmx_setup_me_spte_mask(); 8577 8578 kvm_configure_mmu(enable_ept, 0, vmx_get_max_ept_level(), 8579 ept_caps_to_lpage_level(vmx_capability.ept)); 8580 8581 /* 8582 * Only enable PML when hardware supports PML feature, and both EPT 8583 * and EPT A/D bit features are enabled -- PML depends on them to work. 8584 */ 8585 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 8586 enable_pml = 0; 8587 8588 if (!enable_pml) 8589 vmx_x86_ops.cpu_dirty_log_size = 0; 8590 8591 if (!cpu_has_vmx_preemption_timer()) 8592 enable_preemption_timer = false; 8593 8594 if (enable_preemption_timer) { 8595 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8596 8597 cpu_preemption_timer_multi = 8598 vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; 8599 8600 if (tsc_khz) 8601 use_timer_freq = (u64)tsc_khz * 1000; 8602 use_timer_freq >>= cpu_preemption_timer_multi; 8603 8604 /* 8605 * KVM "disables" the preemption timer by setting it to its max 8606 * value. Don't use the timer if it might cause spurious exits 8607 * at a rate faster than 0.1 Hz (of uninterrupted guest time). 
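		 * The timer counts down at the TSC rate divided by 2^N, with
		 * N taken from VMX_MISC above, so programming the max value
		 * 0xffffffff buys 0xffffffff / use_timer_freq seconds of
		 * "disabled" time. Requiring use_timer_freq <= 0xffffffff / 10
		 * (~429 MHz) keeps that above ten seconds; e.g. a 3 GHz TSC
		 * needs N >= 3 to stay under the limit.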
8608 */ 8609 if (use_timer_freq > 0xffffffffu / 10) 8610 enable_preemption_timer = false; 8611 } 8612 8613 if (!enable_preemption_timer) { 8614 vmx_x86_ops.set_hv_timer = NULL; 8615 vmx_x86_ops.cancel_hv_timer = NULL; 8616 vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; 8617 } 8618 8619 kvm_caps.supported_mce_cap |= MCG_LMCE_P; 8620 kvm_caps.supported_mce_cap |= MCG_CMCI_P; 8621 8622 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) 8623 return -EINVAL; 8624 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) 8625 pt_mode = PT_MODE_SYSTEM; 8626 if (pt_mode == PT_MODE_HOST_GUEST) 8627 vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; 8628 else 8629 vmx_init_ops.handle_intel_pt_intr = NULL; 8630 8631 setup_default_sgx_lepubkeyhash(); 8632 8633 if (nested) { 8634 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 8635 8636 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); 8637 if (r) 8638 return r; 8639 } 8640 8641 vmx_set_cpu_caps(); 8642 8643 r = alloc_kvm_area(); 8644 if (r && nested) 8645 nested_vmx_hardware_unsetup(); 8646 8647 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); 8648 8649 return r; 8650 } 8651 8652 static struct kvm_x86_init_ops vmx_init_ops __initdata = { 8653 .hardware_setup = hardware_setup, 8654 .handle_intel_pt_intr = NULL, 8655 8656 .runtime_ops = &vmx_x86_ops, 8657 .pmu_ops = &intel_pmu_ops, 8658 }; 8659 8660 static void vmx_cleanup_l1d_flush(void) 8661 { 8662 if (vmx_l1d_flush_pages) { 8663 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 8664 vmx_l1d_flush_pages = NULL; 8665 } 8666 /* Restore state so sysfs ignores VMX */ 8667 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 8668 } 8669 8670 static void __vmx_exit(void) 8671 { 8672 allow_smaller_maxphyaddr = false; 8673 8674 cpu_emergency_unregister_virt_callback(vmx_emergency_disable); 8675 8676 vmx_cleanup_l1d_flush(); 8677 } 8678 8679 static void vmx_exit(void) 8680 { 8681 kvm_exit(); 8682 kvm_x86_vendor_exit(); 8683 8684 __vmx_exit(); 8685 } 8686 module_exit(vmx_exit); 8687 8688 static int __init vmx_init(void) 8689 { 8690 int r, cpu; 8691 8692 if (!kvm_is_vmx_supported()) 8693 return -EOPNOTSUPP; 8694 8695 /* 8696 * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing 8697 * to unwind if a later step fails. 8698 */ 8699 hv_init_evmcs(); 8700 8701 r = kvm_x86_vendor_init(&vmx_init_ops); 8702 if (r) 8703 return r; 8704 8705 /* 8706 * Must be called after common x86 init so enable_ept is properly set 8707 * up. Hand the parameter mitigation value in which was stored in 8708 * the pre module init parser. If no parameter was given, it will 8709 * contain 'auto' which will be turned into the default 'cond' 8710 * mitigation mode. 8711 */ 8712 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 8713 if (r) 8714 goto err_l1d_flush; 8715 8716 for_each_possible_cpu(cpu) { 8717 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 8718 8719 pi_init_cpu(cpu); 8720 } 8721 8722 cpu_emergency_register_virt_callback(vmx_emergency_disable); 8723 8724 vmx_check_vmcs12_offsets(); 8725 8726 /* 8727 * Shadow paging doesn't have a (further) performance penalty 8728 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it 8729 * by default 8730 */ 8731 if (!enable_ept) 8732 allow_smaller_maxphyaddr = true; 8733 8734 /* 8735 * Common KVM initialization _must_ come last, after this, /dev/kvm is 8736 * exposed to userspace! 
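	 * In particular, userspace can open /dev/kvm and begin creating VMs
	 * the moment kvm_init() below succeeds, so all vendor setup
	 * (vmcs_config, hardware_setup(), the L1D flush state, etc.) must
	 * already be in place.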
	 */
	r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
		     THIS_MODULE);
	if (r)
		goto err_kvm_init;

	return 0;

err_kvm_init:
	__vmx_exit();
err_l1d_flush:
	kvm_x86_vendor_exit();
	return r;
}
module_init(vmx_init);