// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/objtool.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
#include <linux/entry-kvm.h>

#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/virtext.h>
#include <asm/vmx.h>

#include "capabilities.h"
#include "cpuid.h"
#include "evmcs.h"
#include "hyperv.h"
#include "kvm_onhyperv.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
#endif

bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);

bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

module_param(enable_apicv, bool, S_IRUGO);

bool __read_mostly enable_ipiv = true;
module_param(enable_ipiv, bool, 0444);

/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for their own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);

bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);

static bool __read_mostly error_on_inconsistent_vmcs_config = true;
module_param(error_on_inconsistent_vmcs_config, bool, 0444);

static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

#define KVM_VMX_TSC_MULTIPLIER_MAX	0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

extern bool __read_mostly allow_smaller_maxphyaddr;
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);

#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
	RTIT_STATUS_BYTECNT))

/*
 * List of MSRs that can be directly passed to the guest.
 * In addition to these x2apic and PT MSRs are handled specially.
 */
static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
	MSR_IA32_SPEC_CTRL,
	MSR_IA32_PRED_CMD,
	MSR_IA32_TSC,
#ifdef CONFIG_X86_64
	MSR_FS_BASE,
	MSR_GS_BASE,
	MSR_KERNEL_GS_BASE,
	MSR_IA32_XFD,
	MSR_IA32_XFD_ERR,
#endif
	MSR_IA32_SYSENTER_CS,
	MSR_IA32_SYSENTER_ESP,
	MSR_IA32_SYSENTER_EIP,
	MSR_CORE_C1_RES,
	MSR_CORE_C3_RESIDENCY,
	MSR_CORE_C6_RESIDENCY,
	MSR_CORE_C7_RESIDENCY,
};

/*
 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is
 *             enabled.
 *             According to test, this time is usually smaller than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer SDM volume 3b section 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);

/* Default is SYSTEM mode, 1 for host-guest mode */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
module_param(pt_mode, int, S_IRUGO);

static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

static const struct {
	const char *option;
	bool for_parse;
} vmentry_l1d_param[] = {
	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;

/* Control for disabling CPU Fill buffer clear */
static bool __read_mostly vmx_fb_clear_ctrl_available;

static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
	struct page *page;
	unsigned int i;

	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	if (!enable_ept) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
		return 0;
	}

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
		u64 msr;

		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
		if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
			return 0;
		}
	}

	/* If set to auto use the default l1tf mitigation method */
	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
			l1tf = VMENTER_L1D_FLUSH_NEVER;
			break;
		case L1TF_MITIGATION_FLUSH_NOWARN:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
			l1tf = VMENTER_L1D_FLUSH_COND;
			break;
		case L1TF_MITIGATION_FULL:
		case L1TF_MITIGATION_FULL_FORCE:
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
			break;
		}
	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
	}

	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		/*
		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
		 * lifetime and so should not be charged to a memcg.
		 */
		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!page)
			return -ENOMEM;
		vmx_l1d_flush_pages = page_address(page);

		/*
		 * Initialize each page with a different pattern in
		 * order to protect against KSM in the nested
		 * virtualization case.
		 */
		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
			       PAGE_SIZE);
		}
	}

	l1tf_vmx_mitigation = l1tf;

	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
		static_branch_enable(&vmx_l1d_should_flush);
	else
		static_branch_disable(&vmx_l1d_should_flush);

	if (l1tf == VMENTER_L1D_FLUSH_COND)
		static_branch_enable(&vmx_l1d_flush_cond);
	else
		static_branch_disable(&vmx_l1d_flush_cond);
	return 0;
}

static int vmentry_l1d_flush_parse(const char *s)
{
	unsigned int i;

	if (s) {
		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
			if (vmentry_l1d_param[i].for_parse &&
			    sysfs_streq(s, vmentry_l1d_param[i].option))
				return i;
		}
	}
	return -EINVAL;
}

static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int l1tf, ret;

	l1tf = vmentry_l1d_flush_parse(s);
	if (l1tf < 0)
		return l1tf;

	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	/*
	 * Has vmx_init() run already? If not then this is the pre init
	 * parameter parsing. In that case just store the value and let
	 * vmx_init() do the proper setup after enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = l1tf;
		return 0;
	}

	mutex_lock(&vmx_l1d_flush_mutex);
	ret = vmx_setup_l1d_flush(l1tf);
	mutex_unlock(&vmx_l1d_flush_mutex);
	return ret;
}

static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
		return sprintf(s, "???\n");

	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}

static void vmx_setup_fb_clear_ctrl(void)
{
	u64 msr;

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
	    !boot_cpu_has_bug(X86_BUG_MDS) &&
	    !boot_cpu_has_bug(X86_BUG_TAA)) {
		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
		if (msr & ARCH_CAP_FB_CLEAR_CTRL)
			vmx_fb_clear_ctrl_available = true;
	}
}

static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
{
	u64 msr;

	if (!vmx->disable_fb_clear)
		return;

	msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
	msr |= FB_CLEAR_DIS;
	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
	/* Cache the MSR value to avoid reading it later */
	vmx->msr_ia32_mcu_opt_ctrl = msr;
}

static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
{
	if (!vmx->disable_fb_clear)
		return;

	vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
}

static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
{
	vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;

	/*
	 * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
	 * at VMEntry. Skip the MSR read/write when a guest has no use case to
	 * execute VERW.
	 */
	if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
	    ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
	     (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
		vmx->disable_fb_clear = false;
}

static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);

static u32 vmx_segment_access_rights(struct kvm_segment *var);

void vmx_vmexit(void);

#define vmx_insn_failed(fmt...)		\
do {					\
	WARN_ONCE(1, fmt);		\
	pr_warn_ratelimited(fmt);	\
} while (0)

asmlinkage void vmread_error(unsigned long field, bool fault)
{
	if (fault)
		kvm_spurious_fault();
	else
		vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
}

noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n",
			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n",
			vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n",
			vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
	vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
			ext, vpid, gva);
}

noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
	vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
			ext, eptp, gpa);
}

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

struct vmcs_config vmcs_config;
struct vmx_capability vmx_capability;

#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {					\
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,			\
		.limit = GUEST_##seg##_LIMIT,			\
		.ar_bytes = GUEST_##seg##_AR_BYTES,		\
	}

static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
{
	vmx->segment_cache.bitmask = 0;
}

static unsigned long host_idt_base;

#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);

static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
{
	struct hv_enlightened_vmcs *evmcs;
	struct hv_partition_assist_pg **p_hv_pa_pg =
			&to_kvm_hv(vcpu->kvm)->hv_pa_pg;
	/*
	 * Synthetic VM-Exit is not enabled in current code and so all evmcs in
	 * a single VM share the same assist page.
	 */
	if (!*p_hv_pa_pg)
		*p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);

	if (!*p_hv_pa_pg)
		return -ENOMEM;

	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;

	evmcs->partition_assist_page =
		__pa(*p_hv_pa_pg);
	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;

	return 0;
}

#endif /* IS_ENABLED(CONFIG_HYPERV) */

/*
 * Comment's format: document - errata name - stepping - processor name.
 * Refer from
 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 */
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
/*
 * 320767.pdf - AAP86  - B1 -
 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 */
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
/* 321333.pdf - AAM126 - D0 - Xeon 3500 */
/* 321324.pdf - AAK139 - D0 - Xeon 5500 */
/* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
/* Xeon E3-1220 V2 */
0x000306A8,
};

static inline bool cpu_has_broken_vmx_preemption_timer(void)
{
	u32 eax = cpuid_eax(0x00000001), i;

	/* Clear the reserved bits */
	eax &= ~(0x3U << 14 | 0xfU << 28);
	for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
		if (eax == vmx_preemption_cpu_tfms[i])
			return true;

	return false;
}

static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
	return flexpriority_enabled && lapic_in_kernel(vcpu);
}

static int possible_passthrough_msr_slot(u32 msr)
{
	u32 i;

	for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
		if (vmx_possible_passthrough_msrs[i] == msr)
			return i;

	return -ENOENT;
}

static bool is_valid_passthrough_msr(u32 msr)
{
	bool r;

	switch (msr) {
	case 0x800 ... 0x8ff:
		/* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
		return true;
	case MSR_IA32_RTIT_STATUS:
	case MSR_IA32_RTIT_OUTPUT_BASE:
	case MSR_IA32_RTIT_OUTPUT_MASK:
	case MSR_IA32_RTIT_CR3_MATCH:
	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
		/* PT MSRs. These are handled in pt_update_intercept_for_msr() */
	case MSR_LBR_SELECT:
	case MSR_LBR_TOS:
	case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + 31:
	case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + 31:
	case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + 31:
	case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + 8:
	case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + 8:
		/* LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() */
		return true;
	}

	r = possible_passthrough_msr_slot(msr) != -ENOENT;

	WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);

	return r;
}
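
/*
 * Look up the user-return MSR slot that KVM tracks for @msr; returns NULL if
 * the MSR is not in the user-return list.
 */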
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = kvm_find_user_return_msr(msr);
	if (i >= 0)
		return &vmx->guest_uret_msrs[i];
	return NULL;
}

static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
				  struct vmx_uret_msr *msr, u64 data)
{
	unsigned int slot = msr - vmx->guest_uret_msrs;
	int ret = 0;

	if (msr->load_into_hardware) {
		preempt_disable();
		ret = kvm_set_user_return_msr(slot, data, msr->mask);
		preempt_enable();
	}
	if (!ret)
		msr->data = data;
	return ret;
}

#ifdef CONFIG_KEXEC_CORE
static void crash_vmclear_local_loaded_vmcss(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v;

	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
			    loaded_vmcss_on_cpu_link)
		vmcs_clear(v->vmcs);
}
#endif /* CONFIG_KEXEC_CORE */

static void __loaded_vmcs_clear(void *arg)
{
	struct loaded_vmcs *loaded_vmcs = arg;
	int cpu = raw_smp_processor_id();

	if (loaded_vmcs->cpu != cpu)
		return; /* vcpu migration can race with cpu offline */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;

	vmcs_clear(loaded_vmcs->vmcs);
	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
		vmcs_clear(loaded_vmcs->shadow_vmcs);

	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

	/*
	 * Ensure all writes to loaded_vmcs, including deleting it from its
	 * current percpu list, complete before setting loaded_vmcs->cpu to
	 * -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
	 * and add loaded_vmcs to its percpu list before it's deleted from this
	 * cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
	 */
	smp_wmb();

	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;
}

void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
	int cpu = loaded_vmcs->cpu;

	if (cpu != -1)
		smp_call_function_single(cpu,
			 __loaded_vmcs_clear, loaded_vmcs, 1);
}

static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
				       unsigned field)
{
	bool ret;
	u32 mask = 1 << (seg * SEG_FIELD_NR + field);

	if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
		kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
		vmx->segment_cache.bitmask = 0;
	}
	ret = vmx->segment_cache.bitmask & mask;
	vmx->segment_cache.bitmask |= mask;
	return ret;
}

static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
	u16 *p = &vmx->segment_cache.seg[seg].selector;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
	return *p;
}

static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
	ulong *p = &vmx->segment_cache.seg[seg].base;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
	return *p;
}

static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].limit;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
	return *p;
}

static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].ar;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
	return *p;
}

void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		eb |= (1u << GP_VECTOR);
	if ((vcpu->guest_debug &
	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
		eb |= 1u << BP_VECTOR;
	if (to_vmx(vcpu)->rmode.vm86_active)
		eb = ~0;
	if (!vmx_need_pf_intercept(vcpu))
		eb &= ~(1u << PF_VECTOR);

	/* When we are running a nested L2 guest and L1 specified for it a
	 * certain exception bitmap, we must trap the same exceptions and pass
	 * them to L1. When running L2, we will only handle the exceptions
	 * specified above if L1 did not want them.
	 */
	if (is_guest_mode(vcpu))
		eb |= get_vmcs12(vcpu)->exception_bitmap;
	else {
		int mask = 0, match = 0;

		if (enable_ept && (eb & (1u << PF_VECTOR))) {
			/*
			 * If EPT is enabled, #PF is currently only intercepted
			 * if MAXPHYADDR is smaller on the guest than on the
			 * host. In that case we only care about present,
			 * non-reserved faults. For vmcs02, however, PFEC_MASK
			 * and PFEC_MATCH are set in prepare_vmcs02_rare.
			 */
			mask = PFERR_PRESENT_MASK | PFERR_RSVD_MASK;
			match = PFERR_PRESENT_MASK;
		}
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
	}

	/*
	 * Disabling xfd interception indicates that dynamic xfeatures
	 * might be used in the guest. Always trap #NM in this case
	 * to save guest xfd_err timely.
	 */
	if (vcpu->arch.xfd_no_write_intercept)
		eb |= (1u << NM_VECTOR);

	vmcs_write32(EXCEPTION_BITMAP, eb);
}

/*
 * Check if MSR is intercepted for currently loaded MSR bitmap.
 */
static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{
	if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
		return true;

	return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr);
}

unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
{
	unsigned int flags = 0;

	if (vmx->loaded_vmcs->launched)
		flags |= VMX_RUN_VMRESUME;

	/*
	 * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
	 * to change it directly without causing a vmexit. In that case read
	 * it after vmexit and store it in vmx->spec_ctrl.
	 */
	if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
		flags |= VMX_RUN_SAVE_SPEC_CTRL;

	return flags;
}

static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit)
{
	vm_entry_controls_clearbit(vmx, entry);
	vm_exit_controls_clearbit(vmx, exit);
}

int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
{
	unsigned int i;

	for (i = 0; i < m->nr; ++i) {
		if (m->val[i].index == msr)
			return i;
	}
	return -ENOENT;
}

static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
	int i;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
			return;
		}
		break;
	}
	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (i < 0)
		goto skip_guest;
	--m->guest.nr;
	m->guest.val[i] = m->guest.val[m->guest.nr];
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);

skip_guest:
	i = vmx_find_loadstore_msr_slot(&m->host, msr);
	if (i < 0)
		return;

	--m->host.nr;
	m->host.val[i] = m->host.val[m->host.nr];
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
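
/*
 * Switch the MSR via the dedicated VM-entry/VM-exit load controls instead of
 * the MSR autoload lists.
 */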
static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit,
		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
		u64 guest_val, u64 host_val)
{
	vmcs_write64(guest_val_vmcs, guest_val);
	if (host_val_vmcs != HOST_IA32_EFER)
		vmcs_write64(host_val_vmcs, host_val);
	vm_entry_controls_setbit(vmx, entry);
	vm_exit_controls_setbit(vmx, exit);
}

static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val, bool entry_only)
{
	int i, j = 0;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER,
					GUEST_IA32_EFER,
					HOST_IA32_EFER,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
					GUEST_IA32_PERF_GLOBAL_CTRL,
					HOST_IA32_PERF_GLOBAL_CTRL,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_IA32_PEBS_ENABLE:
		/* PEBS needs a quiescent period after being disabled (to write
		 * a record). Disabling PEBS through VMX MSR swapping doesn't
		 * provide that period, so a CPU could write host's record into
		 * guest's memory.
		 */
		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
	}

	i = vmx_find_loadstore_msr_slot(&m->guest, msr);
	if (!entry_only)
		j = vmx_find_loadstore_msr_slot(&m->host, msr);

	if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
	    (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
		printk_once(KERN_WARNING "Not enough msr switch entries. "
				"Can't add msr %x\n", msr);
		return;
	}
	if (i < 0) {
		i = m->guest.nr++;
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
	}
	m->guest.val[i].index = msr;
	m->guest.val[i].value = guest_val;

	if (entry_only)
		return;

	if (j < 0) {
		j = m->host.nr++;
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
	}
	m->host.val[j].index = msr;
	m->host.val[j].value = host_val;
}

static bool update_transition_efer(struct vcpu_vmx *vmx)
{
	u64 guest_efer = vmx->vcpu.arch.efer;
	u64 ignore_bits = 0;
	int i;

	/* Shadow paging assumes NX to be available. */
	if (!enable_ept)
		guest_efer |= EFER_NX;

	/*
	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
	 */
	ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif

	/*
	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
	 * On CPUs that support "load IA32_EFER", always switch EFER
	 * atomically, since it's faster than switching it manually.
	 */
	if (cpu_has_load_ia32_efer() ||
	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
		if (!(guest_efer & EFER_LMA))
			guest_efer &= ~EFER_LME;
		if (guest_efer != host_efer)
			add_atomic_switch_msr(vmx, MSR_EFER,
					      guest_efer, host_efer, false);
		else
			clear_atomic_switch_msr(vmx, MSR_EFER);
		return false;
	}

	i = kvm_find_user_return_msr(MSR_EFER);
	if (i < 0)
		return false;

	clear_atomic_switch_msr(vmx, MSR_EFER);

	guest_efer &= ~ignore_bits;
	guest_efer |= host_efer & ignore_bits;

	vmx->guest_uret_msrs[i].data = guest_efer;
	vmx->guest_uret_msrs[i].mask = ~ignore_bits;

	return true;
}

#ifdef CONFIG_X86_32
/*
 * On 32-bit kernels, VM exits still load the FS and GS bases from the
 * VMCS rather than the segment table. KVM uses this helper to figure
 * out the current bases to poke them into the VMCS before entry.
 */
static unsigned long segment_base(u16 selector)
{
	struct desc_struct *table;
	unsigned long v;

	if (!(selector & ~SEGMENT_RPL_MASK))
		return 0;

	table = get_current_gdt_ro();

	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		u16 ldt_selector = kvm_read_ldt();

		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
			return 0;

		table = (struct desc_struct *)segment_base(ldt_selector);
	}
	v = get_desc_base(&table[selector >> 3]);
	return v;
}
#endif

static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
{
	return vmx_pt_mode_is_host_guest() &&
	       !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}

static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
{
	/* The base must be 128-byte aligned and a legal physical address. */
	return kvm_vcpu_is_legal_aligned_gpa(vcpu, base, 128);
}

static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

static void pt_guest_enter(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	/*
	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
	 * Save host state before VM entry.
	 */
	rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		wrmsrl(MSR_IA32_RTIT_CTL, 0);
		pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
	}
}

static void pt_guest_exit(struct vcpu_vmx *vmx)
{
	if (vmx_pt_mode_is_system())
		return;

	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
	}

	/*
	 * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
	 * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
	 */
	if (vmx->pt_desc.host.ctl)
		wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}

void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
			unsigned long fs_base, unsigned long gs_base)
{
	if (unlikely(fs_sel != host->fs_sel)) {
		if (!(fs_sel & 7))
			vmcs_write16(HOST_FS_SELECTOR, fs_sel);
		else
			vmcs_write16(HOST_FS_SELECTOR, 0);
		host->fs_sel = fs_sel;
	}
	if (unlikely(gs_sel != host->gs_sel)) {
		if (!(gs_sel & 7))
			vmcs_write16(HOST_GS_SELECTOR, gs_sel);
		else
			vmcs_write16(HOST_GS_SELECTOR, 0);
		host->gs_sel = gs_sel;
	}
	if (unlikely(fs_base != host->fs_base)) {
		vmcs_writel(HOST_FS_BASE, fs_base);
		host->fs_base = fs_base;
	}
	if (unlikely(gs_base != host->gs_base)) {
		vmcs_writel(HOST_GS_BASE, gs_base);
		host->gs_base = gs_base;
	}
}

void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
	int cpu = raw_smp_processor_id();
#endif
	unsigned long fs_base, gs_base;
	u16 fs_sel, gs_sel;
	int i;

	vmx->req_immediate_exit = false;

	/*
	 * Note that guest MSRs to be saved/restored can also be changed
	 * when guest state is loaded. This happens when guest transitions
	 * to/from long-mode by setting MSR_EFER.LMA.
	 */
	if (!vmx->guest_uret_msrs_loaded) {
		vmx->guest_uret_msrs_loaded = true;
		for (i = 0; i < kvm_nr_uret_msrs; ++i) {
			if (!vmx->guest_uret_msrs[i].load_into_hardware)
				continue;

			kvm_set_user_return_msr(i,
						vmx->guest_uret_msrs[i].data,
						vmx->guest_uret_msrs[i].mask);
		}
	}

	if (vmx->nested.need_vmcs12_to_shadow_sync)
		nested_sync_vmcs12_to_shadow(vcpu);

	if (vmx->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	/*
	 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	host_state->ldt_sel = kvm_read_ldt();

#ifdef CONFIG_X86_64
	savesegment(ds, host_state->ds_sel);
	savesegment(es, host_state->es_sel);

	gs_base = cpu_kernelmode_gs_base(cpu);
	if (likely(is_64bit_mm(current->mm))) {
		current_save_fsgs();
		fs_sel = current->thread.fsindex;
		gs_sel = current->thread.gsindex;
		fs_base = current->thread.fsbase;
		vmx->msr_host_kernel_gs_base = current->thread.gsbase;
	} else {
		savesegment(fs, fs_sel);
		savesegment(gs, gs_sel);
		fs_base = read_msr(MSR_FS_BASE);
		vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
	}

	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
	savesegment(fs, fs_sel);
	savesegment(gs, gs_sel);
	fs_base = segment_base(fs_sel);
	gs_base = segment_base(gs_sel);
#endif

	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
	vmx->guest_state_loaded = true;
}

static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
	struct vmcs_host_state *host_state;

	if (!vmx->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	++vmx->vcpu.stat.host_state_reload;

#ifdef CONFIG_X86_64
	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
		kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
		load_gs_index(host_state->gs_sel);
#else
		loadsegment(gs, host_state->gs_sel);
#endif
	}
	if (host_state->fs_sel & 7)
		loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
		loadsegment(ds, host_state->ds_sel);
		loadsegment(es, host_state->es_sel);
	}
#endif
	invalidate_tss_limit();
#ifdef CONFIG_X86_64
	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
	load_fixmap_gdt(raw_smp_processor_id());
	vmx->guest_state_loaded = false;
	vmx->guest_uret_msrs_loaded = false;
}

#ifdef CONFIG_X86_64
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
	preempt_enable();
	return vmx->msr_guest_kernel_gs_base;
}

static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		wrmsrl(MSR_KERNEL_GS_BASE, data);
	preempt_enable();
	vmx->msr_guest_kernel_gs_base = data;
}
#endif

void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
			struct loaded_vmcs *buddy)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
	struct vmcs *prev;

	if (!already_loaded) {
		loaded_vmcs_clear(vmx->loaded_vmcs);
		local_irq_disable();

		/*
		 * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
		 * this cpu's percpu list, otherwise it may not yet be deleted
		 * from its previous cpu's percpu list. Pairs with the
		 * smp_wmb() in __loaded_vmcs_clear().
		 */
		smp_rmb();

		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
		local_irq_enable();
	}

	prev = per_cpu(current_vmcs, cpu);
	if (prev != vmx->loaded_vmcs->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
		vmcs_load(vmx->loaded_vmcs->vmcs);

		/*
		 * No indirect branch prediction barrier needed when switching
		 * the active VMCS within a guest, e.g. on nested VM-Enter.
		 * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
		 */
		if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
			indirect_branch_prediction_barrier();
	}

	if (!already_loaded) {
		void *gdt = get_current_gdt_ro();

		/*
		 * Flush all EPTP/VPID contexts, the new pCPU may have stale
		 * TLB entries from its previous association with the vCPU.
		 */
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors. See 22.2.4.
		 */
		vmcs_writel(HOST_TR_BASE,
			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

		if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
			/* 22.2.3 */
			vmcs_writel(HOST_IA32_SYSENTER_ESP,
				    (unsigned long)(cpu_entry_stack(cpu) + 1));
		}

		vmx->loaded_vmcs->cpu = cpu;
	}
}

/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 */
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx_vcpu_load_vmcs(vcpu, cpu, NULL);

	vmx_vcpu_pi_load(vcpu, cpu);

	vmx->host_debugctlmsr = get_debugctlmsr();
}

static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);

	vmx_prepare_switch_to_host(to_vmx(vcpu));
}

bool vmx_emulation_required(struct kvm_vcpu *vcpu)
{
	return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}

unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long rflags, save_rflags;

	if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		rflags = vmcs_readl(GUEST_RFLAGS);
		if (vmx->rmode.vm86_active) {
			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
			save_rflags = vmx->rmode.save_rflags;
			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
		}
		vmx->rflags = rflags;
	}
	return vmx->rflags;
}

void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long old_rflags;

	if (is_unrestricted_guest(vcpu)) {
		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
		vmx->rflags = rflags;
		vmcs_writel(GUEST_RFLAGS, rflags);
		return;
	}

	old_rflags = vmx_get_rflags(vcpu);
	vmx->rflags = rflags;
	if (vmx->rmode.vm86_active) {
		vmx->rmode.save_rflags = rflags;
		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
	}
	vmcs_writel(GUEST_RFLAGS, rflags);

	if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
		vmx->emulation_required = vmx_emulation_required(vcpu);
}

static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
{
	return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
}

u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	int ret = 0;

	if (interruptibility & GUEST_INTR_STATE_STI)
		ret |= KVM_X86_SHADOW_INT_STI;
	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
		ret |= KVM_X86_SHADOW_INT_MOV_SS;

	return ret;
}

void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	u32 interruptibility = interruptibility_old;

	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
		interruptibility |= GUEST_INTR_STATE_MOV_SS;
	else if (mask & KVM_X86_SHADOW_INT_STI)
		interruptibility |= GUEST_INTR_STATE_STI;

	if ((interruptibility != interruptibility_old))
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}

static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long value;

	/*
	 * Any MSR write that attempts to change bits marked reserved will
	 * cause a #GP fault.
	 */
	if (data & vmx->pt_desc.ctl_bitmask)
		return 1;

	/*
	 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
	 * result in a #GP unless the same write also clears TraceEn.
	 */
	if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
	    ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
		return 1;

	/*
	 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
	 * and FabricEn would cause #GP, if
	 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
	 */
	if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
	    !(data & RTIT_CTL_FABRIC_EN) &&
	    !intel_pt_validate_cap(vmx->pt_desc.caps,
				   PT_CAP_single_range_output))
		return 1;

	/*
	 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
	 * utilizes encodings marked reserved will cause a #GP fault.
	 */
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
	    !test_bit((data & RTIT_CTL_MTC_RANGE) >>
		      RTIT_CTL_MTC_RANGE_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps,
				      PT_CAP_cycle_thresholds);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
	    !test_bit((data & RTIT_CTL_CYC_THRESH) >>
		      RTIT_CTL_CYC_THRESH_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
	    !test_bit((data & RTIT_CTL_PSB_FREQ) >>
		      RTIT_CTL_PSB_FREQ_OFFSET, &value))
		return 1;

	/*
	 * If ADDRx_CFG is reserved or the encoding is greater than 2, the
	 * write will cause a #GP fault.
	 */
	value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 1)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 2)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 3)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
	if ((value && (vmx->pt_desc.num_address_ranges < 4)) || (value > 2))
		return 1;

	return 0;
}

static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
					void *insn, int insn_len)
{
	/*
	 * Emulation of instructions in SGX enclaves is impossible as RIP does
	 * not point at the failing instruction, and even if it did, the code
	 * stream is inaccessible. Inject #UD instead of exiting to userspace
	 * so that guest userspace can't DoS the guest simply by triggering
	 * emulation (enclaves are CPL3 only).
	 */
	if (to_vmx(vcpu)->exit_reason.enclave_mode) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return false;
	}
	return true;
}

static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
	unsigned long rip, orig_rip;
	u32 instr_len;

	/*
	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
	 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
	 * set when EPT misconfig occurs. In practice, real hardware updates
	 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
	 * (namely Hyper-V) don't set it due to it being undefined behavior,
	 * i.e. we end up advancing IP with some random value.
	 */
	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
	    exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
		instr_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);

		/*
		 * Emulating an enclave's instructions isn't supported as KVM
		 * cannot access the enclave's memory or its true RIP, e.g. the
		 * vmcs.GUEST_RIP points at the exit point of the enclave, not
		 * the RIP that actually triggered the VM-Exit. But, because
		 * most instructions that cause VM-Exit will #UD in an enclave,
		 * most instruction-based VM-Exits simply do not occur.
		 *
		 * There are a few exceptions, notably the debug instructions
		 * INT1ICEBRK and INT3, as they are allowed in debug enclaves
		 * and generate #DB/#BP as expected, which KVM might intercept.
		 * But again, the CPU does the dirty work and saves an instr
		 * length of zero so VMMs don't shoot themselves in the foot.
		 * WARN if KVM tries to skip a non-zero length instruction on
		 * a VM-Exit from an enclave.
		 */
		if (!instr_len)
			goto rip_updated;

		WARN(exit_reason.enclave_mode,
		     "KVM: skipping instruction after SGX enclave VM-Exit");

		orig_rip = kvm_rip_read(vcpu);
		rip = orig_rip + instr_len;
#ifdef CONFIG_X86_64
		/*
		 * We need to mask out the high 32 bits of RIP if not in 64-bit
		 * mode, but just finding out that we are in 64-bit mode is
		 * quite expensive. Only do it if there was a carry.
		 */
		if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
			rip = (u32)rip;
#endif
		kvm_rip_write(vcpu, rip);
	} else {
		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;
	}

rip_updated:
	/* skipping an emulated instruction also counts */
	vmx_set_interrupt_shadow(vcpu, 0);

	return 1;
}

/*
 * Recognizes a pending MTF VM-exit and records the nested state for later
 * delivery.
 */
static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!is_guest_mode(vcpu))
		return;

	/*
	 * Per the SDM, MTF takes priority over debug-trap exceptions besides
	 * T-bit traps. As instruction emulation is completed (i.e. at the
	 * instruction boundary), any #DB exception pending delivery must be a
	 * debug-trap. Record the pending MTF state to be delivered in
	 * vmx_check_nested_events().
	 */
	if (nested_cpu_has_mtf(vmcs12) &&
	    (!vcpu->arch.exception.pending ||
	     vcpu->arch.exception.nr == DB_VECTOR))
		vmx->nested.mtf_pending = true;
	else
		vmx->nested.mtf_pending = false;
}

static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	vmx_update_emulated_instruction(vcpu);
	return skip_emulated_instruction(vcpu);
}

static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
	/*
	 * Ensure that we clear the HLT state in the VMCS. We don't need to
	 * explicitly skip the instruction because if the HLT state is set,
	 * then the instruction is already executing and RIP has already been
	 * advanced.
	 */
	if (kvm_hlt_in_guest(vcpu->kvm) &&
	    vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}

static void vmx_queue_exception(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned nr = vcpu->arch.exception.nr;
	bool has_error_code = vcpu->arch.exception.has_error_code;
	u32 error_code = vcpu->arch.exception.error_code;
	u32 intr_info = nr | INTR_INFO_VALID_MASK;

	kvm_deliver_exception_payload(vcpu);

	if (has_error_code) {
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (vmx->rmode.vm86_active) {
		int inc_eip = 0;
		if (kvm_exception_is_soft(nr))
			inc_eip = vcpu->arch.event_exit_inst_len;
		kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
		return;
	}

	WARN_ON_ONCE(vmx->emulation_required);

	if (kvm_exception_is_soft(nr)) {
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	} else
		intr_info |= INTR_TYPE_HARD_EXCEPTION;

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);

	vmx_clear_hlt(vcpu);
}

static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
			       bool load_into_hardware)
{
	struct vmx_uret_msr *uret_msr;

	uret_msr = vmx_find_uret_msr(vmx, msr);
	if (!uret_msr)
		return;

	uret_msr->load_into_hardware = load_into_hardware;
}

/*
 * Configuring user return MSRs to automatically save, load, and restore MSRs
 * that need to be shoved into hardware when running the guest. Note, omitting
 * an MSR here does _NOT_ mean it's not emulated, only that it will not be
 * loaded into hardware when running the guest.
 */
static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
	bool load_syscall_msrs;

	/*
	 * The SYSCALL MSRs are only needed on long mode guests, and only
	 * when EFER.SCE is set.
	 */
	load_syscall_msrs = is_long_mode(&vmx->vcpu) &&
			    (vmx->vcpu.arch.efer & EFER_SCE);

	vmx_setup_uret_msr(vmx, MSR_STAR, load_syscall_msrs);
	vmx_setup_uret_msr(vmx, MSR_LSTAR, load_syscall_msrs);
	vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_syscall_msrs);
#endif
	vmx_setup_uret_msr(vmx, MSR_EFER, update_transition_efer(vmx));

	vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
			   guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP) ||
			   guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDPID));

	/*
	 * hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
	 * kernel and old userspace. If those guests run on a tsx=off host, do
	 * allow guests to use TSX_CTRL, but don't change the value in hardware
	 * so that TSX remains always disabled.
	 */
	vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));

	/*
	 * The set of MSRs to load may have changed, reload MSRs before the
	 * next VM-Enter.
	 */
	vmx->guest_uret_msrs_loaded = false;
}

u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
		return vmcs12->tsc_offset;

	return 0;
}

u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
		return vmcs12->tsc_multiplier;

	return kvm_caps.default_tsc_scaling_ratio;
}

static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
	vmcs_write64(TSC_OFFSET, offset);
}

static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
{
	vmcs_write64(TSC_MULTIPLIER, multiplier);
}

/*
 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
 * all guests if the "nested" module option is off, and can also be disabled
 * for a single guest by disabling its VMX cpuid bit.
 */
bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
{
	return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
}

static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
						 uint64_t val)
{
	uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;

	return !(val & ~valid_bits);
}

static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
{
	switch (msr->index) {
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		if (!nested)
			return 1;
		return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
	case MSR_IA32_PERF_CAPABILITIES:
		msr->data = vmx_get_perf_capabilities();
		return 0;
	default:
		return KVM_MSR_RET_INVALID;
	}
}

/*
 * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
 * Returns 0 on success, non-0 otherwise.
1847 * Assumes vcpu_load() was already called. 1848 */ 1849 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 1850 { 1851 struct vcpu_vmx *vmx = to_vmx(vcpu); 1852 struct vmx_uret_msr *msr; 1853 u32 index; 1854 1855 switch (msr_info->index) { 1856 #ifdef CONFIG_X86_64 1857 case MSR_FS_BASE: 1858 msr_info->data = vmcs_readl(GUEST_FS_BASE); 1859 break; 1860 case MSR_GS_BASE: 1861 msr_info->data = vmcs_readl(GUEST_GS_BASE); 1862 break; 1863 case MSR_KERNEL_GS_BASE: 1864 msr_info->data = vmx_read_guest_kernel_gs_base(vmx); 1865 break; 1866 #endif 1867 case MSR_EFER: 1868 return kvm_get_msr_common(vcpu, msr_info); 1869 case MSR_IA32_TSX_CTRL: 1870 if (!msr_info->host_initiated && 1871 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 1872 return 1; 1873 goto find_uret_msr; 1874 case MSR_IA32_UMWAIT_CONTROL: 1875 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 1876 return 1; 1877 1878 msr_info->data = vmx->msr_ia32_umwait_control; 1879 break; 1880 case MSR_IA32_SPEC_CTRL: 1881 if (!msr_info->host_initiated && 1882 !guest_has_spec_ctrl_msr(vcpu)) 1883 return 1; 1884 1885 msr_info->data = to_vmx(vcpu)->spec_ctrl; 1886 break; 1887 case MSR_IA32_SYSENTER_CS: 1888 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 1889 break; 1890 case MSR_IA32_SYSENTER_EIP: 1891 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); 1892 break; 1893 case MSR_IA32_SYSENTER_ESP: 1894 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 1895 break; 1896 case MSR_IA32_BNDCFGS: 1897 if (!kvm_mpx_supported() || 1898 (!msr_info->host_initiated && 1899 !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) 1900 return 1; 1901 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 1902 break; 1903 case MSR_IA32_MCG_EXT_CTL: 1904 if (!msr_info->host_initiated && 1905 !(vmx->msr_ia32_feature_control & 1906 FEAT_CTL_LMCE_ENABLED)) 1907 return 1; 1908 msr_info->data = vcpu->arch.mcg_ext_ctl; 1909 break; 1910 case MSR_IA32_FEAT_CTL: 1911 msr_info->data = vmx->msr_ia32_feature_control; 1912 break; 1913 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 1914 if (!msr_info->host_initiated && 1915 !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) 1916 return 1; 1917 msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash 1918 [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; 1919 break; 1920 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 1921 if (!nested_vmx_allowed(vcpu)) 1922 return 1; 1923 if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, 1924 &msr_info->data)) 1925 return 1; 1926 /* 1927 * Enlightened VMCS v1 doesn't have certain VMCS fields but 1928 * instead of just ignoring the features, different Hyper-V 1929 * versions are either trying to use them and fail or do some 1930 * sanity checking and refuse to boot. Filter all unsupported 1931 * features out. 
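* Only guest-initiated reads are filtered; host-initiated (ioctl) reads still see the full feature set.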
1932 */ 1933 if (!msr_info->host_initiated && 1934 vmx->nested.enlightened_vmcs_enabled) 1935 nested_evmcs_filter_control_msr(msr_info->index, 1936 &msr_info->data); 1937 break; 1938 case MSR_IA32_RTIT_CTL: 1939 if (!vmx_pt_mode_is_host_guest()) 1940 return 1; 1941 msr_info->data = vmx->pt_desc.guest.ctl; 1942 break; 1943 case MSR_IA32_RTIT_STATUS: 1944 if (!vmx_pt_mode_is_host_guest()) 1945 return 1; 1946 msr_info->data = vmx->pt_desc.guest.status; 1947 break; 1948 case MSR_IA32_RTIT_CR3_MATCH: 1949 if (!vmx_pt_mode_is_host_guest() || 1950 !intel_pt_validate_cap(vmx->pt_desc.caps, 1951 PT_CAP_cr3_filtering)) 1952 return 1; 1953 msr_info->data = vmx->pt_desc.guest.cr3_match; 1954 break; 1955 case MSR_IA32_RTIT_OUTPUT_BASE: 1956 if (!vmx_pt_mode_is_host_guest() || 1957 (!intel_pt_validate_cap(vmx->pt_desc.caps, 1958 PT_CAP_topa_output) && 1959 !intel_pt_validate_cap(vmx->pt_desc.caps, 1960 PT_CAP_single_range_output))) 1961 return 1; 1962 msr_info->data = vmx->pt_desc.guest.output_base; 1963 break; 1964 case MSR_IA32_RTIT_OUTPUT_MASK: 1965 if (!vmx_pt_mode_is_host_guest() || 1966 (!intel_pt_validate_cap(vmx->pt_desc.caps, 1967 PT_CAP_topa_output) && 1968 !intel_pt_validate_cap(vmx->pt_desc.caps, 1969 PT_CAP_single_range_output))) 1970 return 1; 1971 msr_info->data = vmx->pt_desc.guest.output_mask; 1972 break; 1973 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: 1974 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 1975 if (!vmx_pt_mode_is_host_guest() || 1976 (index >= 2 * vmx->pt_desc.num_address_ranges)) 1977 return 1; 1978 if (index % 2) 1979 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; 1980 else 1981 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 1982 break; 1983 case MSR_IA32_DEBUGCTLMSR: 1984 msr_info->data = vmcs_read64(GUEST_IA32_DEBUGCTL); 1985 break; 1986 default: 1987 find_uret_msr: 1988 msr = vmx_find_uret_msr(vmx, msr_info->index); 1989 if (msr) { 1990 msr_info->data = msr->data; 1991 break; 1992 } 1993 return kvm_get_msr_common(vcpu, msr_info); 1994 } 1995 1996 return 0; 1997 } 1998 1999 static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu, 2000 u64 data) 2001 { 2002 #ifdef CONFIG_X86_64 2003 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) 2004 return (u32)data; 2005 #endif 2006 return (unsigned long)data; 2007 } 2008 2009 static u64 vcpu_supported_debugctl(struct kvm_vcpu *vcpu) 2010 { 2011 u64 debugctl = vmx_supported_debugctl(); 2012 2013 if (!intel_pmu_lbr_is_enabled(vcpu)) 2014 debugctl &= ~DEBUGCTLMSR_LBR_MASK; 2015 2016 if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) 2017 debugctl &= ~DEBUGCTLMSR_BUS_LOCK_DETECT; 2018 2019 return debugctl; 2020 } 2021 2022 /* 2023 * Writes msr value into the appropriate "register". 2024 * Returns 0 on success, non-0 otherwise. 2025 * Assumes vcpu_load() was already called. 
2026 */ 2027 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2028 { 2029 struct vcpu_vmx *vmx = to_vmx(vcpu); 2030 struct vmx_uret_msr *msr; 2031 int ret = 0; 2032 u32 msr_index = msr_info->index; 2033 u64 data = msr_info->data; 2034 u32 index; 2035 2036 switch (msr_index) { 2037 case MSR_EFER: 2038 ret = kvm_set_msr_common(vcpu, msr_info); 2039 break; 2040 #ifdef CONFIG_X86_64 2041 case MSR_FS_BASE: 2042 vmx_segment_cache_clear(vmx); 2043 vmcs_writel(GUEST_FS_BASE, data); 2044 break; 2045 case MSR_GS_BASE: 2046 vmx_segment_cache_clear(vmx); 2047 vmcs_writel(GUEST_GS_BASE, data); 2048 break; 2049 case MSR_KERNEL_GS_BASE: 2050 vmx_write_guest_kernel_gs_base(vmx, data); 2051 break; 2052 case MSR_IA32_XFD: 2053 ret = kvm_set_msr_common(vcpu, msr_info); 2054 /* 2055 * Always intercepting WRMSR could incur non-negligible 2056 * overhead given xfd might be changed frequently in 2057 * guest context switch. Disable write interception 2058 * upon the first write with a non-zero value (indicating 2059 * potential usage on dynamic xfeatures). Also update 2060 * exception bitmap to trap #NM for proper virtualization 2061 * of guest xfd_err. 2062 */ 2063 if (!ret && data) { 2064 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD, 2065 MSR_TYPE_RW); 2066 vcpu->arch.xfd_no_write_intercept = true; 2067 vmx_update_exception_bitmap(vcpu); 2068 } 2069 break; 2070 #endif 2071 case MSR_IA32_SYSENTER_CS: 2072 if (is_guest_mode(vcpu)) 2073 get_vmcs12(vcpu)->guest_sysenter_cs = data; 2074 vmcs_write32(GUEST_SYSENTER_CS, data); 2075 break; 2076 case MSR_IA32_SYSENTER_EIP: 2077 if (is_guest_mode(vcpu)) { 2078 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2079 get_vmcs12(vcpu)->guest_sysenter_eip = data; 2080 } 2081 vmcs_writel(GUEST_SYSENTER_EIP, data); 2082 break; 2083 case MSR_IA32_SYSENTER_ESP: 2084 if (is_guest_mode(vcpu)) { 2085 data = nested_vmx_truncate_sysenter_addr(vcpu, data); 2086 get_vmcs12(vcpu)->guest_sysenter_esp = data; 2087 } 2088 vmcs_writel(GUEST_SYSENTER_ESP, data); 2089 break; 2090 case MSR_IA32_DEBUGCTLMSR: { 2091 u64 invalid = data & ~vcpu_supported_debugctl(vcpu); 2092 if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) { 2093 if (report_ignored_msrs) 2094 vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n", 2095 __func__, data); 2096 data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2097 invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR); 2098 } 2099 2100 if (invalid) 2101 return 1; 2102 2103 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & 2104 VM_EXIT_SAVE_DEBUG_CONTROLS) 2105 get_vmcs12(vcpu)->guest_ia32_debugctl = data; 2106 2107 vmcs_write64(GUEST_IA32_DEBUGCTL, data); 2108 if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event && 2109 (data & DEBUGCTLMSR_LBR)) 2110 intel_pmu_create_guest_lbr_event(vcpu); 2111 return 0; 2112 } 2113 case MSR_IA32_BNDCFGS: 2114 if (!kvm_mpx_supported() || 2115 (!msr_info->host_initiated && 2116 !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) 2117 return 1; 2118 if (is_noncanonical_address(data & PAGE_MASK, vcpu) || 2119 (data & MSR_IA32_BNDCFGS_RSVD)) 2120 return 1; 2121 2122 if (is_guest_mode(vcpu) && 2123 ((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) || 2124 (vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS))) 2125 get_vmcs12(vcpu)->guest_bndcfgs = data; 2126 2127 vmcs_write64(GUEST_BNDCFGS, data); 2128 break; 2129 case MSR_IA32_UMWAIT_CONTROL: 2130 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 2131 return 1; 2132 2133 /* The reserved bit 1 and non-32 bit [63:32] 
should be zero */ 2134 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) 2135 return 1; 2136 2137 vmx->msr_ia32_umwait_control = data; 2138 break; 2139 case MSR_IA32_SPEC_CTRL: 2140 if (!msr_info->host_initiated && 2141 !guest_has_spec_ctrl_msr(vcpu)) 2142 return 1; 2143 2144 if (kvm_spec_ctrl_test_value(data)) 2145 return 1; 2146 2147 vmx->spec_ctrl = data; 2148 if (!data) 2149 break; 2150 2151 /* 2152 * For non-nested: 2153 * When it's written (to non-zero) for the first time, pass 2154 * it through. 2155 * 2156 * For nested: 2157 * The handling of the MSR bitmap for L2 guests is done in 2158 * nested_vmx_prepare_msr_bitmap. We should not touch the 2159 * vmcs02.msr_bitmap here since it gets completely overwritten 2160 * in the merging. We update the vmcs01 here for L1 as well 2161 * since it will end up touching the MSR anyway now. 2162 */ 2163 vmx_disable_intercept_for_msr(vcpu, 2164 MSR_IA32_SPEC_CTRL, 2165 MSR_TYPE_RW); 2166 break; 2167 case MSR_IA32_TSX_CTRL: 2168 if (!msr_info->host_initiated && 2169 !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) 2170 return 1; 2171 if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) 2172 return 1; 2173 goto find_uret_msr; 2174 case MSR_IA32_PRED_CMD: 2175 if (!msr_info->host_initiated && 2176 !guest_has_pred_cmd_msr(vcpu)) 2177 return 1; 2178 2179 if (data & ~PRED_CMD_IBPB) 2180 return 1; 2181 if (!boot_cpu_has(X86_FEATURE_IBPB)) 2182 return 1; 2183 if (!data) 2184 break; 2185 2186 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); 2187 2188 /* 2189 * For non-nested: 2190 * When it's written (to non-zero) for the first time, pass 2191 * it through. 2192 * 2193 * For nested: 2194 * The handling of the MSR bitmap for L2 guests is done in 2195 * nested_vmx_prepare_msr_bitmap. We should not touch the 2196 * vmcs02.msr_bitmap here since it gets completely overwritten 2197 * in the merging. 2198 */ 2199 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W); 2200 break; 2201 case MSR_IA32_CR_PAT: 2202 if (!kvm_pat_valid(data)) 2203 return 1; 2204 2205 if (is_guest_mode(vcpu) && 2206 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 2207 get_vmcs12(vcpu)->guest_ia32_pat = data; 2208 2209 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2210 vmcs_write64(GUEST_IA32_PAT, data); 2211 vcpu->arch.pat = data; 2212 break; 2213 } 2214 ret = kvm_set_msr_common(vcpu, msr_info); 2215 break; 2216 case MSR_IA32_MCG_EXT_CTL: 2217 if ((!msr_info->host_initiated && 2218 !(to_vmx(vcpu)->msr_ia32_feature_control & 2219 FEAT_CTL_LMCE_ENABLED)) || 2220 (data & ~MCG_EXT_CTL_LMCE_EN)) 2221 return 1; 2222 vcpu->arch.mcg_ext_ctl = data; 2223 break; 2224 case MSR_IA32_FEAT_CTL: 2225 if (!vmx_feature_control_msr_valid(vcpu, data) || 2226 (to_vmx(vcpu)->msr_ia32_feature_control & 2227 FEAT_CTL_LOCKED && !msr_info->host_initiated)) 2228 return 1; 2229 vmx->msr_ia32_feature_control = data; 2230 if (msr_info->host_initiated && data == 0) 2231 vmx_leave_nested(vcpu); 2232 2233 /* SGX may be enabled/disabled by guest's firmware */ 2234 vmx_write_encls_bitmap(vcpu, NULL); 2235 break; 2236 case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: 2237 /* 2238 * On real hardware, the LE hash MSRs are writable before 2239 * the firmware sets bit 0 in MSR 0x7a ("activating" SGX), 2240 * at which point SGX related bits in IA32_FEATURE_CONTROL 2241 * become writable. 2242 * 2243 * KVM does not emulate SGX activation for simplicity, so 2244 * allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL 2245 * is unlocked. 
This is technically not architectural 2246 * behavior, but it's close enough. 2247 */ 2248 if (!msr_info->host_initiated && 2249 (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || 2250 ((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) && 2251 !(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED)))) 2252 return 1; 2253 vmx->msr_ia32_sgxlepubkeyhash 2254 [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; 2255 break; 2256 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 2257 if (!msr_info->host_initiated) 2258 return 1; /* they are read-only */ 2259 if (!nested_vmx_allowed(vcpu)) 2260 return 1; 2261 return vmx_set_vmx_msr(vcpu, msr_index, data); 2262 case MSR_IA32_RTIT_CTL: 2263 if (!vmx_pt_mode_is_host_guest() || 2264 vmx_rtit_ctl_check(vcpu, data) || 2265 vmx->nested.vmxon) 2266 return 1; 2267 vmcs_write64(GUEST_IA32_RTIT_CTL, data); 2268 vmx->pt_desc.guest.ctl = data; 2269 pt_update_intercept_for_msr(vcpu); 2270 break; 2271 case MSR_IA32_RTIT_STATUS: 2272 if (!pt_can_write_msr(vmx)) 2273 return 1; 2274 if (data & MSR_IA32_RTIT_STATUS_MASK) 2275 return 1; 2276 vmx->pt_desc.guest.status = data; 2277 break; 2278 case MSR_IA32_RTIT_CR3_MATCH: 2279 if (!pt_can_write_msr(vmx)) 2280 return 1; 2281 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2282 PT_CAP_cr3_filtering)) 2283 return 1; 2284 vmx->pt_desc.guest.cr3_match = data; 2285 break; 2286 case MSR_IA32_RTIT_OUTPUT_BASE: 2287 if (!pt_can_write_msr(vmx)) 2288 return 1; 2289 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2290 PT_CAP_topa_output) && 2291 !intel_pt_validate_cap(vmx->pt_desc.caps, 2292 PT_CAP_single_range_output)) 2293 return 1; 2294 if (!pt_output_base_valid(vcpu, data)) 2295 return 1; 2296 vmx->pt_desc.guest.output_base = data; 2297 break; 2298 case MSR_IA32_RTIT_OUTPUT_MASK: 2299 if (!pt_can_write_msr(vmx)) 2300 return 1; 2301 if (!intel_pt_validate_cap(vmx->pt_desc.caps, 2302 PT_CAP_topa_output) && 2303 !intel_pt_validate_cap(vmx->pt_desc.caps, 2304 PT_CAP_single_range_output)) 2305 return 1; 2306 vmx->pt_desc.guest.output_mask = data; 2307 break; 2308 case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: 2309 if (!pt_can_write_msr(vmx)) 2310 return 1; 2311 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2312 if (index >= 2 * vmx->pt_desc.num_address_ranges) 2313 return 1; 2314 if (is_noncanonical_address(data, vcpu)) 2315 return 1; 2316 if (index % 2) 2317 vmx->pt_desc.guest.addr_b[index / 2] = data; 2318 else 2319 vmx->pt_desc.guest.addr_a[index / 2] = data; 2320 break; 2321 case MSR_IA32_PERF_CAPABILITIES: 2322 if (data && !vcpu_to_pmu(vcpu)->version) 2323 return 1; 2324 if (data & PMU_CAP_LBR_FMT) { 2325 if ((data & PMU_CAP_LBR_FMT) != 2326 (vmx_get_perf_capabilities() & PMU_CAP_LBR_FMT)) 2327 return 1; 2328 if (!cpuid_model_is_consistent(vcpu)) 2329 return 1; 2330 } 2331 if (data & PERF_CAP_PEBS_FORMAT) { 2332 if ((data & PERF_CAP_PEBS_MASK) != 2333 (vmx_get_perf_capabilities() & PERF_CAP_PEBS_MASK)) 2334 return 1; 2335 if (!guest_cpuid_has(vcpu, X86_FEATURE_DS)) 2336 return 1; 2337 if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64)) 2338 return 1; 2339 if (!cpuid_model_is_consistent(vcpu)) 2340 return 1; 2341 } 2342 ret = kvm_set_msr_common(vcpu, msr_info); 2343 break; 2344 2345 default: 2346 find_uret_msr: 2347 msr = vmx_find_uret_msr(vmx, msr_index); 2348 if (msr) 2349 ret = vmx_set_guest_uret_msr(vmx, msr, data); 2350 else 2351 ret = kvm_set_msr_common(vcpu, msr_info); 2352 } 2353 2354 /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */ 2355 if (msr_index == MSR_IA32_ARCH_CAPABILITIES) 2356 vmx_update_fb_clear_dis(vcpu, vmx); 2357 2358 return ret; 2359 } 2360 2361 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2362 { 2363 unsigned long guest_owned_bits; 2364 2365 kvm_register_mark_available(vcpu, reg); 2366 2367 switch (reg) { 2368 case VCPU_REGS_RSP: 2369 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2370 break; 2371 case VCPU_REGS_RIP: 2372 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2373 break; 2374 case VCPU_EXREG_PDPTR: 2375 if (enable_ept) 2376 ept_save_pdptrs(vcpu); 2377 break; 2378 case VCPU_EXREG_CR0: 2379 guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 2380 2381 vcpu->arch.cr0 &= ~guest_owned_bits; 2382 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; 2383 break; 2384 case VCPU_EXREG_CR3: 2385 /* 2386 * When intercepting CR3 loads, e.g. for shadowing paging, KVM's 2387 * CR3 is loaded into hardware, not the guest's CR3. 2388 */ 2389 if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) 2390 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 2391 break; 2392 case VCPU_EXREG_CR4: 2393 guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2394 2395 vcpu->arch.cr4 &= ~guest_owned_bits; 2396 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; 2397 break; 2398 default: 2399 KVM_BUG_ON(1, vcpu->kvm); 2400 break; 2401 } 2402 } 2403 2404 static __init int cpu_has_kvm_support(void) 2405 { 2406 return cpu_has_vmx(); 2407 } 2408 2409 static __init int vmx_disabled_by_bios(void) 2410 { 2411 return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || 2412 !boot_cpu_has(X86_FEATURE_VMX); 2413 } 2414 2415 static int kvm_cpu_vmxon(u64 vmxon_pointer) 2416 { 2417 u64 msr; 2418 2419 cr4_set_bits(X86_CR4_VMXE); 2420 2421 asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" 2422 _ASM_EXTABLE(1b, %l[fault]) 2423 : : [vmxon_pointer] "m"(vmxon_pointer) 2424 : : fault); 2425 return 0; 2426 2427 fault: 2428 WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", 2429 rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 
0xdeadbeef : msr); 2430 cr4_clear_bits(X86_CR4_VMXE); 2431 2432 return -EFAULT; 2433 } 2434 2435 static int vmx_hardware_enable(void) 2436 { 2437 int cpu = raw_smp_processor_id(); 2438 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2439 int r; 2440 2441 if (cr4_read_shadow() & X86_CR4_VMXE) 2442 return -EBUSY; 2443 2444 /* 2445 * This can happen if we hot-added a CPU but failed to allocate 2446 * VP assist page for it. 2447 */ 2448 if (static_branch_unlikely(&enable_evmcs) && 2449 !hv_get_vp_assist_page(cpu)) 2450 return -EFAULT; 2451 2452 intel_pt_handle_vmx(1); 2453 2454 r = kvm_cpu_vmxon(phys_addr); 2455 if (r) { 2456 intel_pt_handle_vmx(0); 2457 return r; 2458 } 2459 2460 if (enable_ept) 2461 ept_sync_global(); 2462 2463 return 0; 2464 } 2465 2466 static void vmclear_local_loaded_vmcss(void) 2467 { 2468 int cpu = raw_smp_processor_id(); 2469 struct loaded_vmcs *v, *n; 2470 2471 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2472 loaded_vmcss_on_cpu_link) 2473 __loaded_vmcs_clear(v); 2474 } 2475 2476 static void vmx_hardware_disable(void) 2477 { 2478 vmclear_local_loaded_vmcss(); 2479 2480 if (cpu_vmxoff()) 2481 kvm_spurious_fault(); 2482 2483 intel_pt_handle_vmx(0); 2484 } 2485 2486 /* 2487 * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID 2488 * directly instead of going through cpu_has(), to ensure KVM is trapping 2489 * ENCLS whenever it's supported in hardware. It does not matter whether 2490 * the host OS supports or has enabled SGX. 2491 */ 2492 static bool cpu_has_sgx(void) 2493 { 2494 return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0)); 2495 } 2496 2497 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 2498 u32 msr, u32 *result) 2499 { 2500 u32 vmx_msr_low, vmx_msr_high; 2501 u32 ctl = ctl_min | ctl_opt; 2502 2503 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2504 2505 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2506 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2507 2508 /* Ensure minimum (required) set of control bits are supported. */ 2509 if (ctl_min & ~ctl) 2510 return -EIO; 2511 2512 *result = ctl; 2513 return 0; 2514 } 2515 2516 static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr) 2517 { 2518 u64 allowed; 2519 2520 rdmsrl(msr, allowed); 2521 2522 return ctl_opt & allowed; 2523 } 2524 2525 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2526 struct vmx_capability *vmx_cap) 2527 { 2528 u32 vmx_msr_low, vmx_msr_high; 2529 u32 min, opt, min2, opt2; 2530 u32 _pin_based_exec_control = 0; 2531 u32 _cpu_based_exec_control = 0; 2532 u32 _cpu_based_2nd_exec_control = 0; 2533 u64 _cpu_based_3rd_exec_control = 0; 2534 u32 _vmexit_control = 0; 2535 u32 _vmentry_control = 0; 2536 int i; 2537 2538 /* 2539 * LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory. 2540 * SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always 2541 * intercepts writes to PAT and EFER, i.e. never enables those controls. 
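* Each VM-Entry control listed below must be paired with its VM-Exit counterpart; the loop over this table further down warns about, and clears, any half-supported pair.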
2542 */ 2543 struct { 2544 u32 entry_control; 2545 u32 exit_control; 2546 } const vmcs_entry_exit_pairs[] = { 2547 { VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL }, 2548 { VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT }, 2549 { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, 2550 { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, 2551 { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, 2552 }; 2553 2554 memset(vmcs_conf, 0, sizeof(*vmcs_conf)); 2555 min = CPU_BASED_HLT_EXITING | 2556 #ifdef CONFIG_X86_64 2557 CPU_BASED_CR8_LOAD_EXITING | 2558 CPU_BASED_CR8_STORE_EXITING | 2559 #endif 2560 CPU_BASED_CR3_LOAD_EXITING | 2561 CPU_BASED_CR3_STORE_EXITING | 2562 CPU_BASED_UNCOND_IO_EXITING | 2563 CPU_BASED_MOV_DR_EXITING | 2564 CPU_BASED_USE_TSC_OFFSETTING | 2565 CPU_BASED_MWAIT_EXITING | 2566 CPU_BASED_MONITOR_EXITING | 2567 CPU_BASED_INVLPG_EXITING | 2568 CPU_BASED_RDPMC_EXITING; 2569 2570 opt = CPU_BASED_TPR_SHADOW | 2571 CPU_BASED_USE_MSR_BITMAPS | 2572 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | 2573 CPU_BASED_ACTIVATE_TERTIARY_CONTROLS; 2574 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 2575 &_cpu_based_exec_control) < 0) 2576 return -EIO; 2577 #ifdef CONFIG_X86_64 2578 if (_cpu_based_exec_control & CPU_BASED_TPR_SHADOW) 2579 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & 2580 ~CPU_BASED_CR8_STORE_EXITING; 2581 #endif 2582 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2583 min2 = 0; 2584 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2585 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2586 SECONDARY_EXEC_WBINVD_EXITING | 2587 SECONDARY_EXEC_ENABLE_VPID | 2588 SECONDARY_EXEC_ENABLE_EPT | 2589 SECONDARY_EXEC_UNRESTRICTED_GUEST | 2590 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 2591 SECONDARY_EXEC_DESC | 2592 SECONDARY_EXEC_ENABLE_RDTSCP | 2593 SECONDARY_EXEC_ENABLE_INVPCID | 2594 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2595 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2596 SECONDARY_EXEC_SHADOW_VMCS | 2597 SECONDARY_EXEC_XSAVES | 2598 SECONDARY_EXEC_RDSEED_EXITING | 2599 SECONDARY_EXEC_RDRAND_EXITING | 2600 SECONDARY_EXEC_ENABLE_PML | 2601 SECONDARY_EXEC_TSC_SCALING | 2602 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2603 SECONDARY_EXEC_PT_USE_GPA | 2604 SECONDARY_EXEC_PT_CONCEAL_VMX | 2605 SECONDARY_EXEC_ENABLE_VMFUNC | 2606 SECONDARY_EXEC_BUS_LOCK_DETECTION | 2607 SECONDARY_EXEC_NOTIFY_VM_EXITING; 2608 if (cpu_has_sgx()) 2609 opt2 |= SECONDARY_EXEC_ENCLS_EXITING; 2610 if (adjust_vmx_controls(min2, opt2, 2611 MSR_IA32_VMX_PROCBASED_CTLS2, 2612 &_cpu_based_2nd_exec_control) < 0) 2613 return -EIO; 2614 } 2615 #ifndef CONFIG_X86_64 2616 if (!(_cpu_based_2nd_exec_control & 2617 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2618 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2619 #endif 2620 2621 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2622 _cpu_based_2nd_exec_control &= ~( 2623 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2624 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2625 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2626 2627 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, 2628 &vmx_cap->ept, &vmx_cap->vpid); 2629 2630 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 2631 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT 2632 enabled */ 2633 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 2634 CPU_BASED_CR3_STORE_EXITING | 2635 CPU_BASED_INVLPG_EXITING); 2636 } else if (vmx_cap->ept) { 2637 pr_warn_once("EPT CAP should not exist if not support " 2638 "1-setting enable EPT VM-execution control\n"); 2639 2640 if 
(error_on_inconsistent_vmcs_config) 2641 return -EIO; 2642 2643 vmx_cap->ept = 0; 2644 } 2645 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && 2646 vmx_cap->vpid) { 2647 pr_warn_once("VPID CAP should not exist if not support " 2648 "1-setting enable VPID VM-execution control\n"); 2649 2650 if (error_on_inconsistent_vmcs_config) 2651 return -EIO; 2652 2653 vmx_cap->vpid = 0; 2654 } 2655 2656 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) { 2657 u64 opt3 = TERTIARY_EXEC_IPI_VIRT; 2658 2659 _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3, 2660 MSR_IA32_VMX_PROCBASED_CTLS3); 2661 } 2662 2663 min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; 2664 #ifdef CONFIG_X86_64 2665 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 2666 #endif 2667 opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | 2668 VM_EXIT_LOAD_IA32_PAT | 2669 VM_EXIT_LOAD_IA32_EFER | 2670 VM_EXIT_CLEAR_BNDCFGS | 2671 VM_EXIT_PT_CONCEAL_PIP | 2672 VM_EXIT_CLEAR_IA32_RTIT_CTL; 2673 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, 2674 &_vmexit_control) < 0) 2675 return -EIO; 2676 2677 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 2678 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | 2679 PIN_BASED_VMX_PREEMPTION_TIMER; 2680 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 2681 &_pin_based_exec_control) < 0) 2682 return -EIO; 2683 2684 if (cpu_has_broken_vmx_preemption_timer()) 2685 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 2686 if (!(_cpu_based_2nd_exec_control & 2687 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) 2688 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2689 2690 min = VM_ENTRY_LOAD_DEBUG_CONTROLS; 2691 opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 2692 VM_ENTRY_LOAD_IA32_PAT | 2693 VM_ENTRY_LOAD_IA32_EFER | 2694 VM_ENTRY_LOAD_BNDCFGS | 2695 VM_ENTRY_PT_CONCEAL_PIP | 2696 VM_ENTRY_LOAD_IA32_RTIT_CTL; 2697 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, 2698 &_vmentry_control) < 0) 2699 return -EIO; 2700 2701 for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) { 2702 u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control; 2703 u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control; 2704 2705 if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl)) 2706 continue; 2707 2708 pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n", 2709 _vmentry_control & n_ctrl, _vmexit_control & x_ctrl); 2710 2711 if (error_on_inconsistent_vmcs_config) 2712 return -EIO; 2713 2714 _vmentry_control &= ~n_ctrl; 2715 _vmexit_control &= ~x_ctrl; 2716 } 2717 2718 /* 2719 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they 2720 * can't be used due to an errata where VM Exit may incorrectly clear 2721 * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the 2722 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 2723 */ 2724 if (boot_cpu_data.x86 == 0x6) { 2725 switch (boot_cpu_data.x86_model) { 2726 case 26: /* AAK155 */ 2727 case 30: /* AAP115 */ 2728 case 37: /* AAT100 */ 2729 case 44: /* BC86,AAY89,BD102 */ 2730 case 46: /* BA97 */ 2731 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 2732 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 2733 pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 2734 "does not work properly. Using workaround\n"); 2735 break; 2736 default: 2737 break; 2738 } 2739 } 2740 2741 2742 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); 2743 2744 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. 
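* (Bits 44:32 of IA32_VMX_BASIC encode the VMCS region size, which is why the check below masks the high dword with 0x1fff.)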
*/ 2745 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) 2746 return -EIO; 2747 2748 #ifdef CONFIG_X86_64 2749 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ 2750 if (vmx_msr_high & (1u<<16)) 2751 return -EIO; 2752 #endif 2753 2754 /* Require Write-Back (WB) memory type for VMCS accesses. */ 2755 if (((vmx_msr_high >> 18) & 15) != 6) 2756 return -EIO; 2757 2758 vmcs_conf->size = vmx_msr_high & 0x1fff; 2759 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; 2760 2761 vmcs_conf->revision_id = vmx_msr_low; 2762 2763 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2764 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2765 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2766 vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control; 2767 vmcs_conf->vmexit_ctrl = _vmexit_control; 2768 vmcs_conf->vmentry_ctrl = _vmentry_control; 2769 2770 #if IS_ENABLED(CONFIG_HYPERV) 2771 if (enlightened_vmcs) 2772 evmcs_sanitize_exec_ctrls(vmcs_conf); 2773 #endif 2774 2775 return 0; 2776 } 2777 2778 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 2779 { 2780 int node = cpu_to_node(cpu); 2781 struct page *pages; 2782 struct vmcs *vmcs; 2783 2784 pages = __alloc_pages_node(node, flags, 0); 2785 if (!pages) 2786 return NULL; 2787 vmcs = page_address(pages); 2788 memset(vmcs, 0, vmcs_config.size); 2789 2790 /* KVM supports Enlightened VMCS v1 only */ 2791 if (static_branch_unlikely(&enable_evmcs)) 2792 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2793 else 2794 vmcs->hdr.revision_id = vmcs_config.revision_id; 2795 2796 if (shadow) 2797 vmcs->hdr.shadow_vmcs = 1; 2798 return vmcs; 2799 } 2800 2801 void free_vmcs(struct vmcs *vmcs) 2802 { 2803 free_page((unsigned long)vmcs); 2804 } 2805 2806 /* 2807 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 2808 */ 2809 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2810 { 2811 if (!loaded_vmcs->vmcs) 2812 return; 2813 loaded_vmcs_clear(loaded_vmcs); 2814 free_vmcs(loaded_vmcs->vmcs); 2815 loaded_vmcs->vmcs = NULL; 2816 if (loaded_vmcs->msr_bitmap) 2817 free_page((unsigned long)loaded_vmcs->msr_bitmap); 2818 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 2819 } 2820 2821 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2822 { 2823 loaded_vmcs->vmcs = alloc_vmcs(false); 2824 if (!loaded_vmcs->vmcs) 2825 return -ENOMEM; 2826 2827 vmcs_clear(loaded_vmcs->vmcs); 2828 2829 loaded_vmcs->shadow_vmcs = NULL; 2830 loaded_vmcs->hv_timer_soft_disabled = false; 2831 loaded_vmcs->cpu = -1; 2832 loaded_vmcs->launched = 0; 2833 2834 if (cpu_has_vmx_msr_bitmap()) { 2835 loaded_vmcs->msr_bitmap = (unsigned long *) 2836 __get_free_page(GFP_KERNEL_ACCOUNT); 2837 if (!loaded_vmcs->msr_bitmap) 2838 goto out_vmcs; 2839 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 2840 } 2841 2842 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 2843 memset(&loaded_vmcs->controls_shadow, 0, 2844 sizeof(struct vmcs_controls_shadow)); 2845 2846 return 0; 2847 2848 out_vmcs: 2849 free_loaded_vmcs(loaded_vmcs); 2850 return -ENOMEM; 2851 } 2852 2853 static void free_kvm_area(void) 2854 { 2855 int cpu; 2856 2857 for_each_possible_cpu(cpu) { 2858 free_vmcs(per_cpu(vmxarea, cpu)); 2859 per_cpu(vmxarea, cpu) = NULL; 2860 } 2861 } 2862 2863 static __init int alloc_kvm_area(void) 2864 { 2865 int cpu; 2866 2867 for_each_possible_cpu(cpu) { 2868 struct vmcs *vmcs; 2869 2870 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); 2871 if (!vmcs) { 2872 free_kvm_area(); 2873 return -ENOMEM; 2874 } 2875 2876 /* 
2877 * When eVMCS is enabled, alloc_vmcs_cpu() sets 2878 * vmcs->revision_id to KVM_EVMCS_VERSION instead of 2879 * revision_id reported by MSR_IA32_VMX_BASIC. 2880 * 2881 * However, even though not explicitly documented by 2882 * TLFS, VMXArea passed as VMXON argument should 2883 * still be marked with revision_id reported by 2884 * physical CPU. 2885 */ 2886 if (static_branch_unlikely(&enable_evmcs)) 2887 vmcs->hdr.revision_id = vmcs_config.revision_id; 2888 2889 per_cpu(vmxarea, cpu) = vmcs; 2890 } 2891 return 0; 2892 } 2893 2894 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 2895 struct kvm_segment *save) 2896 { 2897 if (!emulate_invalid_guest_state) { 2898 /* 2899 * CS and SS RPL should be equal during guest entry according 2900 * to VMX spec, but in reality it is not always so. Since vcpu 2901 * is in the middle of the transition from real mode to 2902 * protected mode it is safe to assume that RPL 0 is a good 2903 * default value. 2904 */ 2905 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 2906 save->selector &= ~SEGMENT_RPL_MASK; 2907 save->dpl = save->selector & SEGMENT_RPL_MASK; 2908 save->s = 1; 2909 } 2910 __vmx_set_segment(vcpu, save, seg); 2911 } 2912 2913 static void enter_pmode(struct kvm_vcpu *vcpu) 2914 { 2915 unsigned long flags; 2916 struct vcpu_vmx *vmx = to_vmx(vcpu); 2917 2918 /* 2919 * Update real mode segment cache. It may be not up-to-date if segment 2920 * register was written while vcpu was in a guest mode. 2921 */ 2922 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 2923 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 2924 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 2925 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 2926 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 2927 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 2928 2929 vmx->rmode.vm86_active = 0; 2930 2931 __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 2932 2933 flags = vmcs_readl(GUEST_RFLAGS); 2934 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 2935 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 2936 vmcs_writel(GUEST_RFLAGS, flags); 2937 2938 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 2939 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 2940 2941 vmx_update_exception_bitmap(vcpu); 2942 2943 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 2944 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 2945 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 2946 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 2947 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 2948 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 2949 } 2950 2951 static void fix_rmode_seg(int seg, struct kvm_segment *save) 2952 { 2953 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2954 struct kvm_segment var = *save; 2955 2956 var.dpl = 0x3; 2957 if (seg == VCPU_SREG_CS) 2958 var.type = 0x3; 2959 2960 if (!emulate_invalid_guest_state) { 2961 var.selector = var.base >> 4; 2962 var.base = var.base & 0xffff0; 2963 var.limit = 0xffff; 2964 var.g = 0; 2965 var.db = 0; 2966 var.present = 1; 2967 var.s = 1; 2968 var.l = 0; 2969 var.unusable = 0; 2970 var.type = 0x3; 2971 var.avl = 0; 2972 if (save->base & 0xf) 2973 printk_once(KERN_WARNING "kvm: segment base is not " 2974 "paragraph aligned when entering " 2975 "protected 
mode (seg=%d)", seg); 2976 } 2977 2978 vmcs_write16(sf->selector, var.selector); 2979 vmcs_writel(sf->base, var.base); 2980 vmcs_write32(sf->limit, var.limit); 2981 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 2982 } 2983 2984 static void enter_rmode(struct kvm_vcpu *vcpu) 2985 { 2986 unsigned long flags; 2987 struct vcpu_vmx *vmx = to_vmx(vcpu); 2988 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); 2989 2990 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 2991 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 2992 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 2993 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 2994 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 2995 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 2996 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 2997 2998 vmx->rmode.vm86_active = 1; 2999 3000 /* 3001 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 3002 * vcpu. Warn the user that an update is overdue. 3003 */ 3004 if (!kvm_vmx->tss_addr) 3005 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " 3006 "called before entering vcpu\n"); 3007 3008 vmx_segment_cache_clear(vmx); 3009 3010 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); 3011 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3012 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3013 3014 flags = vmcs_readl(GUEST_RFLAGS); 3015 vmx->rmode.save_rflags = flags; 3016 3017 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3018 3019 vmcs_writel(GUEST_RFLAGS, flags); 3020 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3021 vmx_update_exception_bitmap(vcpu); 3022 3023 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3024 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3025 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3026 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3027 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3028 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3029 } 3030 3031 int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 3032 { 3033 struct vcpu_vmx *vmx = to_vmx(vcpu); 3034 3035 /* Nothing to do if hardware doesn't support EFER. */ 3036 if (!vmx_find_uret_msr(vmx, MSR_EFER)) 3037 return 0; 3038 3039 vcpu->arch.efer = efer; 3040 if (efer & EFER_LMA) 3041 vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE); 3042 else 3043 vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE); 3044 3045 vmx_setup_uret_msrs(vmx); 3046 return 0; 3047 } 3048 3049 #ifdef CONFIG_X86_64 3050 3051 static void enter_lmode(struct kvm_vcpu *vcpu) 3052 { 3053 u32 guest_tr_ar; 3054 3055 vmx_segment_cache_clear(to_vmx(vcpu)); 3056 3057 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 3058 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { 3059 pr_debug_ratelimited("%s: tss fixup for long mode. 
\n", 3060 __func__); 3061 vmcs_write32(GUEST_TR_AR_BYTES, 3062 (guest_tr_ar & ~VMX_AR_TYPE_MASK) 3063 | VMX_AR_TYPE_BUSY_64_TSS); 3064 } 3065 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 3066 } 3067 3068 static void exit_lmode(struct kvm_vcpu *vcpu) 3069 { 3070 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3071 } 3072 3073 #endif 3074 3075 static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) 3076 { 3077 struct vcpu_vmx *vmx = to_vmx(vcpu); 3078 3079 /* 3080 * INVEPT must be issued when EPT is enabled, irrespective of VPID, as 3081 * the CPU is not required to invalidate guest-physical mappings on 3082 * VM-Entry, even if VPID is disabled. Guest-physical mappings are 3083 * associated with the root EPT structure and not any particular VPID 3084 * (INVVPID also isn't required to invalidate guest-physical mappings). 3085 */ 3086 if (enable_ept) { 3087 ept_sync_global(); 3088 } else if (enable_vpid) { 3089 if (cpu_has_vmx_invvpid_global()) { 3090 vpid_sync_vcpu_global(); 3091 } else { 3092 vpid_sync_vcpu_single(vmx->vpid); 3093 vpid_sync_vcpu_single(vmx->nested.vpid02); 3094 } 3095 } 3096 } 3097 3098 static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) 3099 { 3100 if (is_guest_mode(vcpu)) 3101 return nested_get_vpid02(vcpu); 3102 return to_vmx(vcpu)->vpid; 3103 } 3104 3105 static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) 3106 { 3107 struct kvm_mmu *mmu = vcpu->arch.mmu; 3108 u64 root_hpa = mmu->root.hpa; 3109 3110 /* No flush required if the current context is invalid. */ 3111 if (!VALID_PAGE(root_hpa)) 3112 return; 3113 3114 if (enable_ept) 3115 ept_sync_context(construct_eptp(vcpu, root_hpa, 3116 mmu->root_role.level)); 3117 else 3118 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3119 } 3120 3121 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 3122 { 3123 /* 3124 * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in 3125 * vmx_flush_tlb_guest() for an explanation of why this is ok. 3126 */ 3127 vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); 3128 } 3129 3130 static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) 3131 { 3132 /* 3133 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a 3134 * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are 3135 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is 3136 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), 3137 * i.e. no explicit INVVPID is necessary. 
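* For a non-zero vpid, vpid_sync_context() uses single-context INVVPID when the CPU supports it and otherwise falls back to a global flush.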
3138 */ 3139 vpid_sync_context(vmx_get_current_vpid(vcpu)); 3140 } 3141 3142 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) 3143 { 3144 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3145 3146 if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) 3147 return; 3148 3149 if (is_pae_paging(vcpu)) { 3150 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3151 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3152 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3153 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3154 } 3155 } 3156 3157 void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3158 { 3159 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3160 3161 if (WARN_ON_ONCE(!is_pae_paging(vcpu))) 3162 return; 3163 3164 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3165 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3166 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3167 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3168 3169 kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR); 3170 } 3171 3172 #define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ 3173 CPU_BASED_CR3_STORE_EXITING) 3174 3175 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3176 { 3177 struct vcpu_vmx *vmx = to_vmx(vcpu); 3178 unsigned long hw_cr0, old_cr0_pg; 3179 u32 tmp; 3180 3181 old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); 3182 3183 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); 3184 if (is_unrestricted_guest(vcpu)) 3185 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3186 else { 3187 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 3188 if (!enable_ept) 3189 hw_cr0 |= X86_CR0_WP; 3190 3191 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3192 enter_pmode(vcpu); 3193 3194 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3195 enter_rmode(vcpu); 3196 } 3197 3198 vmcs_writel(CR0_READ_SHADOW, cr0); 3199 vmcs_writel(GUEST_CR0, hw_cr0); 3200 vcpu->arch.cr0 = cr0; 3201 kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); 3202 3203 #ifdef CONFIG_X86_64 3204 if (vcpu->arch.efer & EFER_LME) { 3205 if (!old_cr0_pg && (cr0 & X86_CR0_PG)) 3206 enter_lmode(vcpu); 3207 else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) 3208 exit_lmode(vcpu); 3209 } 3210 #endif 3211 3212 if (enable_ept && !is_unrestricted_guest(vcpu)) { 3213 /* 3214 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If 3215 * the below code _enables_ CR3 exiting, vmx_cache_reg() will 3216 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks 3217 * KVM's CR3 is installed. 3218 */ 3219 if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) 3220 vmx_cache_reg(vcpu, VCPU_EXREG_CR3); 3221 3222 /* 3223 * When running with EPT but not unrestricted guest, KVM must 3224 * intercept CR3 accesses when paging is _disabled_. This is 3225 * necessary because restricted guests can't actually run with 3226 * paging disabled, and so KVM stuffs its own CR3 in order to 3227 * run the guest when identity mapped page tables. 3228 * 3229 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the 3230 * update, it may be stale with respect to CR3 interception, 3231 * e.g. after nested VM-Enter. 3232 * 3233 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or 3234 * stores to forward them to L1, even if KVM does not need to 3235 * intercept them to preserve its identity mapped page tables. 
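* The three branches below implement exactly that: force CR3 exiting while paging is disabled, clear it for L1 otherwise, and merge vmcs12's settings when running L2.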
3236 */ 3237 if (!(cr0 & X86_CR0_PG)) { 3238 exec_controls_setbit(vmx, CR3_EXITING_BITS); 3239 } else if (!is_guest_mode(vcpu)) { 3240 exec_controls_clearbit(vmx, CR3_EXITING_BITS); 3241 } else { 3242 tmp = exec_controls_get(vmx); 3243 tmp &= ~CR3_EXITING_BITS; 3244 tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; 3245 exec_controls_set(vmx, tmp); 3246 } 3247 3248 /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ 3249 if ((old_cr0_pg ^ cr0) & X86_CR0_PG) 3250 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3251 3252 /* 3253 * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but 3254 * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG. 3255 */ 3256 if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG)) 3257 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 3258 } 3259 3260 /* depends on vcpu->arch.cr0 to be set to a new value */ 3261 vmx->emulation_required = vmx_emulation_required(vcpu); 3262 } 3263 3264 static int vmx_get_max_tdp_level(void) 3265 { 3266 if (cpu_has_vmx_ept_5levels()) 3267 return 5; 3268 return 4; 3269 } 3270 3271 u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level) 3272 { 3273 u64 eptp = VMX_EPTP_MT_WB; 3274 3275 eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; 3276 3277 if (enable_ept_ad_bits && 3278 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) 3279 eptp |= VMX_EPTP_AD_ENABLE_BIT; 3280 eptp |= root_hpa; 3281 3282 return eptp; 3283 } 3284 3285 static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, 3286 int root_level) 3287 { 3288 struct kvm *kvm = vcpu->kvm; 3289 bool update_guest_cr3 = true; 3290 unsigned long guest_cr3; 3291 u64 eptp; 3292 3293 if (enable_ept) { 3294 eptp = construct_eptp(vcpu, root_hpa, root_level); 3295 vmcs_write64(EPT_POINTER, eptp); 3296 3297 hv_track_root_tdp(vcpu, root_hpa); 3298 3299 if (!enable_unrestricted_guest && !is_paging(vcpu)) 3300 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; 3301 else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3)) 3302 guest_cr3 = vcpu->arch.cr3; 3303 else /* vmcs.GUEST_CR3 is already up-to-date. */ 3304 update_guest_cr3 = false; 3305 vmx_ept_load_pdptrs(vcpu); 3306 } else { 3307 guest_cr3 = root_hpa | kvm_get_active_pcid(vcpu); 3308 } 3309 3310 if (update_guest_cr3) 3311 vmcs_writel(GUEST_CR3, guest_cr3); 3312 } 3313 3314 3315 static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3316 { 3317 /* 3318 * We operate under the default treatment of SMM, so VMX cannot be 3319 * enabled under SMM. Note, whether or not VMXE is allowed at all, 3320 * i.e. is a reserved bit, is handled by common x86 code. 3321 */ 3322 if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu)) 3323 return false; 3324 3325 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) 3326 return false; 3327 3328 return true; 3329 } 3330 3331 void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3332 { 3333 unsigned long old_cr4 = vcpu->arch.cr4; 3334 struct vcpu_vmx *vmx = to_vmx(vcpu); 3335 /* 3336 * Pass through host's Machine Check Enable value to hw_cr4, which 3337 * is in force while we are in guest mode. Do not let guests control 3338 * this bit, even if host CR4.MCE == 0. 
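* (Architecturally, a machine-check exception raised while CR4.MCE is clear shuts the processor down instead of being handled.)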
3339 */ 3340 unsigned long hw_cr4; 3341 3342 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); 3343 if (is_unrestricted_guest(vcpu)) 3344 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; 3345 else if (vmx->rmode.vm86_active) 3346 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; 3347 else 3348 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; 3349 3350 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { 3351 if (cr4 & X86_CR4_UMIP) { 3352 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); 3353 hw_cr4 &= ~X86_CR4_UMIP; 3354 } else if (!is_guest_mode(vcpu) || 3355 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { 3356 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); 3357 } 3358 } 3359 3360 vcpu->arch.cr4 = cr4; 3361 kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); 3362 3363 if (!is_unrestricted_guest(vcpu)) { 3364 if (enable_ept) { 3365 if (!is_paging(vcpu)) { 3366 hw_cr4 &= ~X86_CR4_PAE; 3367 hw_cr4 |= X86_CR4_PSE; 3368 } else if (!(cr4 & X86_CR4_PAE)) { 3369 hw_cr4 &= ~X86_CR4_PAE; 3370 } 3371 } 3372 3373 /* 3374 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in 3375 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs 3376 * to be manually disabled when guest switches to non-paging 3377 * mode. 3378 * 3379 * If !enable_unrestricted_guest, the CPU is always running 3380 * with CR0.PG=1 and CR4 needs to be modified. 3381 * If enable_unrestricted_guest, the CPU automatically 3382 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. 3383 */ 3384 if (!is_paging(vcpu)) 3385 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 3386 } 3387 3388 vmcs_writel(CR4_READ_SHADOW, cr4); 3389 vmcs_writel(GUEST_CR4, hw_cr4); 3390 3391 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) 3392 kvm_update_cpuid_runtime(vcpu); 3393 } 3394 3395 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3396 { 3397 struct vcpu_vmx *vmx = to_vmx(vcpu); 3398 u32 ar; 3399 3400 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3401 *var = vmx->rmode.segs[seg]; 3402 if (seg == VCPU_SREG_TR 3403 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3404 return; 3405 var->base = vmx_read_guest_seg_base(vmx, seg); 3406 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3407 return; 3408 } 3409 var->base = vmx_read_guest_seg_base(vmx, seg); 3410 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3411 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3412 ar = vmx_read_guest_seg_ar(vmx, seg); 3413 var->unusable = (ar >> 16) & 1; 3414 var->type = ar & 15; 3415 var->s = (ar >> 4) & 1; 3416 var->dpl = (ar >> 5) & 3; 3417 /* 3418 * Some userspaces do not preserve unusable property. Since usable 3419 * segment has to be present according to VMX spec we can use present 3420 * property to amend userspace bug by making unusable segment always 3421 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3422 * segment as unusable. 
3423 */ 3424 var->present = !var->unusable; 3425 var->avl = (ar >> 12) & 1; 3426 var->l = (ar >> 13) & 1; 3427 var->db = (ar >> 14) & 1; 3428 var->g = (ar >> 15) & 1; 3429 } 3430 3431 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3432 { 3433 struct kvm_segment s; 3434 3435 if (to_vmx(vcpu)->rmode.vm86_active) { 3436 vmx_get_segment(vcpu, &s, seg); 3437 return s.base; 3438 } 3439 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3440 } 3441 3442 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3443 { 3444 struct vcpu_vmx *vmx = to_vmx(vcpu); 3445 3446 if (unlikely(vmx->rmode.vm86_active)) 3447 return 0; 3448 else { 3449 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3450 return VMX_AR_DPL(ar); 3451 } 3452 } 3453 3454 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3455 { 3456 u32 ar; 3457 3458 if (var->unusable || !var->present) 3459 ar = 1 << 16; 3460 else { 3461 ar = var->type & 15; 3462 ar |= (var->s & 1) << 4; 3463 ar |= (var->dpl & 3) << 5; 3464 ar |= (var->present & 1) << 7; 3465 ar |= (var->avl & 1) << 12; 3466 ar |= (var->l & 1) << 13; 3467 ar |= (var->db & 1) << 14; 3468 ar |= (var->g & 1) << 15; 3469 } 3470 3471 return ar; 3472 } 3473 3474 void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3475 { 3476 struct vcpu_vmx *vmx = to_vmx(vcpu); 3477 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3478 3479 vmx_segment_cache_clear(vmx); 3480 3481 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3482 vmx->rmode.segs[seg] = *var; 3483 if (seg == VCPU_SREG_TR) 3484 vmcs_write16(sf->selector, var->selector); 3485 else if (var->s) 3486 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3487 return; 3488 } 3489 3490 vmcs_writel(sf->base, var->base); 3491 vmcs_write32(sf->limit, var->limit); 3492 vmcs_write16(sf->selector, var->selector); 3493 3494 /* 3495 * Fix the "Accessed" bit in AR field of segment registers for older 3496 * qemu binaries. 3497 * IA32 arch specifies that at the time of processor reset the 3498 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3499 * is setting it to 0 in the userland code. This causes invalid guest 3500 * state vmexit when "unrestricted guest" mode is turned on. 3501 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3502 * tree. Newer qemu binaries with that qemu fix would not need this 3503 * kvm hack. 
3504 */ 3505 if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR)) 3506 var->type |= 0x1; /* Accessed */ 3507 3508 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3509 } 3510 3511 static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3512 { 3513 __vmx_set_segment(vcpu, var, seg); 3514 3515 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 3516 } 3517 3518 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3519 { 3520 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3521 3522 *db = (ar >> 14) & 1; 3523 *l = (ar >> 13) & 1; 3524 } 3525 3526 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3527 { 3528 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3529 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3530 } 3531 3532 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3533 { 3534 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3535 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3536 } 3537 3538 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3539 { 3540 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3541 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3542 } 3543 3544 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3545 { 3546 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3547 vmcs_writel(GUEST_GDTR_BASE, dt->address); 3548 } 3549 3550 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3551 { 3552 struct kvm_segment var; 3553 u32 ar; 3554 3555 vmx_get_segment(vcpu, &var, seg); 3556 var.dpl = 0x3; 3557 if (seg == VCPU_SREG_CS) 3558 var.type = 0x3; 3559 ar = vmx_segment_access_rights(&var); 3560 3561 if (var.base != (var.selector << 4)) 3562 return false; 3563 if (var.limit != 0xffff) 3564 return false; 3565 if (ar != 0xf3) 3566 return false; 3567 3568 return true; 3569 } 3570 3571 static bool code_segment_valid(struct kvm_vcpu *vcpu) 3572 { 3573 struct kvm_segment cs; 3574 unsigned int cs_rpl; 3575 3576 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3577 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3578 3579 if (cs.unusable) 3580 return false; 3581 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 3582 return false; 3583 if (!cs.s) 3584 return false; 3585 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 3586 if (cs.dpl > cs_rpl) 3587 return false; 3588 } else { 3589 if (cs.dpl != cs_rpl) 3590 return false; 3591 } 3592 if (!cs.present) 3593 return false; 3594 3595 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3596 return true; 3597 } 3598 3599 static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3600 { 3601 struct kvm_segment ss; 3602 unsigned int ss_rpl; 3603 3604 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3605 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3606 3607 if (ss.unusable) 3608 return true; 3609 if (ss.type != 3 && ss.type != 7) 3610 return false; 3611 if (!ss.s) 3612 return false; 3613 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3614 return false; 3615 if (!ss.present) 3616 return false; 3617 3618 return true; 3619 } 3620 3621 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3622 { 3623 struct kvm_segment var; 3624 unsigned int rpl; 3625 3626 vmx_get_segment(vcpu, &var, seg); 3627 rpl = var.selector & SEGMENT_RPL_MASK; 3628 3629 if (var.unusable) 3630 return true; 3631 if (!var.s) 3632 return false; 3633 if (!var.present) 3634 return false; 3635 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 3636 if (var.dpl < rpl) /* DPL < RPL */ 3637 return false; 
3638 } 3639 3640 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3641 * rights flags 3642 */ 3643 return true; 3644 } 3645 3646 static bool tr_valid(struct kvm_vcpu *vcpu) 3647 { 3648 struct kvm_segment tr; 3649 3650 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3651 3652 if (tr.unusable) 3653 return false; 3654 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3655 return false; 3656 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3657 return false; 3658 if (!tr.present) 3659 return false; 3660 3661 return true; 3662 } 3663 3664 static bool ldtr_valid(struct kvm_vcpu *vcpu) 3665 { 3666 struct kvm_segment ldtr; 3667 3668 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3669 3670 if (ldtr.unusable) 3671 return true; 3672 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3673 return false; 3674 if (ldtr.type != 2) 3675 return false; 3676 if (!ldtr.present) 3677 return false; 3678 3679 return true; 3680 } 3681 3682 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3683 { 3684 struct kvm_segment cs, ss; 3685 3686 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3687 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3688 3689 return ((cs.selector & SEGMENT_RPL_MASK) == 3690 (ss.selector & SEGMENT_RPL_MASK)); 3691 } 3692 3693 /* 3694 * Check if guest state is valid. Returns true if valid, false if 3695 * not. 3696 * We assume that registers are always usable 3697 */ 3698 bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu) 3699 { 3700 /* real mode guest state checks */ 3701 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3702 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3703 return false; 3704 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3705 return false; 3706 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3707 return false; 3708 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3709 return false; 3710 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3711 return false; 3712 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3713 return false; 3714 } else { 3715 /* protected mode guest state checks */ 3716 if (!cs_ss_rpl_check(vcpu)) 3717 return false; 3718 if (!code_segment_valid(vcpu)) 3719 return false; 3720 if (!stack_segment_valid(vcpu)) 3721 return false; 3722 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3723 return false; 3724 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3725 return false; 3726 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3727 return false; 3728 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 3729 return false; 3730 if (!tr_valid(vcpu)) 3731 return false; 3732 if (!ldtr_valid(vcpu)) 3733 return false; 3734 } 3735 /* TODO: 3736 * - Add checks on RIP 3737 * - Add checks on RFLAGS 3738 */ 3739 3740 return true; 3741 } 3742 3743 static int init_rmode_tss(struct kvm *kvm, void __user *ua) 3744 { 3745 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 3746 u16 data; 3747 int i; 3748 3749 for (i = 0; i < 3; i++) { 3750 if (__copy_to_user(ua + PAGE_SIZE * i, zero_page, PAGE_SIZE)) 3751 return -EFAULT; 3752 } 3753 3754 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 3755 if (__copy_to_user(ua + TSS_IOPB_BASE_OFFSET, &data, sizeof(u16))) 3756 return -EFAULT; 3757 3758 data = ~0; 3759 if (__copy_to_user(ua + RMODE_TSS_SIZE - 1, &data, sizeof(u8))) 3760 return -EFAULT; 3761 3762 return 0; 3763 } 3764 3765 static int init_rmode_identity_map(struct kvm *kvm) 3766 { 3767 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 3768 int i, r = 0; 3769 void __user *uaddr; 3770 u32 tmp; 3771 3772 /* Protect 
kvm_vmx->ept_identity_pagetable_done. */ 3773 mutex_lock(&kvm->slots_lock); 3774 3775 if (likely(kvm_vmx->ept_identity_pagetable_done)) 3776 goto out; 3777 3778 if (!kvm_vmx->ept_identity_map_addr) 3779 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 3780 3781 uaddr = __x86_set_memory_region(kvm, 3782 IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 3783 kvm_vmx->ept_identity_map_addr, 3784 PAGE_SIZE); 3785 if (IS_ERR(uaddr)) { 3786 r = PTR_ERR(uaddr); 3787 goto out; 3788 } 3789 3790 /* Set up identity-mapping pagetable for EPT in real mode */ 3791 for (i = 0; i < (PAGE_SIZE / sizeof(tmp)); i++) { 3792 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 3793 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 3794 if (__copy_to_user(uaddr + i * sizeof(tmp), &tmp, sizeof(tmp))) { 3795 r = -EFAULT; 3796 goto out; 3797 } 3798 } 3799 kvm_vmx->ept_identity_pagetable_done = true; 3800 3801 out: 3802 mutex_unlock(&kvm->slots_lock); 3803 return r; 3804 } 3805 3806 static void seg_setup(int seg) 3807 { 3808 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3809 unsigned int ar; 3810 3811 vmcs_write16(sf->selector, 0); 3812 vmcs_writel(sf->base, 0); 3813 vmcs_write32(sf->limit, 0xffff); 3814 ar = 0x93; 3815 if (seg == VCPU_SREG_CS) 3816 ar |= 0x08; /* code segment */ 3817 3818 vmcs_write32(sf->ar_bytes, ar); 3819 } 3820 3821 static int alloc_apic_access_page(struct kvm *kvm) 3822 { 3823 struct page *page; 3824 void __user *hva; 3825 int ret = 0; 3826 3827 mutex_lock(&kvm->slots_lock); 3828 if (kvm->arch.apic_access_memslot_enabled) 3829 goto out; 3830 hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 3831 APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); 3832 if (IS_ERR(hva)) { 3833 ret = PTR_ERR(hva); 3834 goto out; 3835 } 3836 3837 page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); 3838 if (is_error_page(page)) { 3839 ret = -EFAULT; 3840 goto out; 3841 } 3842 3843 /* 3844 * Do not pin the page in memory, so that memory hot-unplug 3845 * is able to migrate it. 3846 */ 3847 put_page(page); 3848 kvm->arch.apic_access_memslot_enabled = true; 3849 out: 3850 mutex_unlock(&kvm->slots_lock); 3851 return ret; 3852 } 3853 3854 int allocate_vpid(void) 3855 { 3856 int vpid; 3857 3858 if (!enable_vpid) 3859 return 0; 3860 spin_lock(&vmx_vpid_lock); 3861 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 3862 if (vpid < VMX_NR_VPIDS) 3863 __set_bit(vpid, vmx_vpid_bitmap); 3864 else 3865 vpid = 0; 3866 spin_unlock(&vmx_vpid_lock); 3867 return vpid; 3868 } 3869 3870 void free_vpid(int vpid) 3871 { 3872 if (!enable_vpid || vpid == 0) 3873 return; 3874 spin_lock(&vmx_vpid_lock); 3875 __clear_bit(vpid, vmx_vpid_bitmap); 3876 spin_unlock(&vmx_vpid_lock); 3877 } 3878 3879 static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx) 3880 { 3881 /* 3882 * When KVM is a nested hypervisor on top of Hyper-V and uses 3883 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR 3884 * bitmap has changed. 3885 */ 3886 if (static_branch_unlikely(&enable_evmcs)) 3887 evmcs_touch_msr_bitmap(); 3888 3889 vmx->nested.force_msr_bitmap_recalc = true; 3890 } 3891 3892 void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 3893 { 3894 struct vcpu_vmx *vmx = to_vmx(vcpu); 3895 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 3896 3897 if (!cpu_has_vmx_msr_bitmap()) 3898 return; 3899 3900 vmx_msr_bitmap_l01_changed(vmx); 3901 3902 /* 3903 * Mark the desired intercept state in shadow bitmap, this is needed 3904 * for resync when the MSR filters change. 
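 * Roughly: clearing a bit in vmx->shadow_msr_intercept below records that
 * KVM wants the MSR passed through; vmx_msr_filter_changed() later walks
 * those shadow bits and calls this function again, so the new userspace
 * filter is re-checked for every pass-through candidate.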
3905 */ 3906 if (is_valid_passthrough_msr(msr)) { 3907 int idx = possible_passthrough_msr_slot(msr); 3908 3909 if (idx != -ENOENT) { 3910 if (type & MSR_TYPE_R) 3911 clear_bit(idx, vmx->shadow_msr_intercept.read); 3912 if (type & MSR_TYPE_W) 3913 clear_bit(idx, vmx->shadow_msr_intercept.write); 3914 } 3915 } 3916 3917 if ((type & MSR_TYPE_R) && 3918 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) { 3919 vmx_set_msr_bitmap_read(msr_bitmap, msr); 3920 type &= ~MSR_TYPE_R; 3921 } 3922 3923 if ((type & MSR_TYPE_W) && 3924 !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) { 3925 vmx_set_msr_bitmap_write(msr_bitmap, msr); 3926 type &= ~MSR_TYPE_W; 3927 } 3928 3929 if (type & MSR_TYPE_R) 3930 vmx_clear_msr_bitmap_read(msr_bitmap, msr); 3931 3932 if (type & MSR_TYPE_W) 3933 vmx_clear_msr_bitmap_write(msr_bitmap, msr); 3934 } 3935 3936 void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) 3937 { 3938 struct vcpu_vmx *vmx = to_vmx(vcpu); 3939 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 3940 3941 if (!cpu_has_vmx_msr_bitmap()) 3942 return; 3943 3944 vmx_msr_bitmap_l01_changed(vmx); 3945 3946 /* 3947 * Mark the desired intercept state in shadow bitmap, this is needed 3948 * for resync when the MSR filter changes. 3949 */ 3950 if (is_valid_passthrough_msr(msr)) { 3951 int idx = possible_passthrough_msr_slot(msr); 3952 3953 if (idx != -ENOENT) { 3954 if (type & MSR_TYPE_R) 3955 set_bit(idx, vmx->shadow_msr_intercept.read); 3956 if (type & MSR_TYPE_W) 3957 set_bit(idx, vmx->shadow_msr_intercept.write); 3958 } 3959 } 3960 3961 if (type & MSR_TYPE_R) 3962 vmx_set_msr_bitmap_read(msr_bitmap, msr); 3963 3964 if (type & MSR_TYPE_W) 3965 vmx_set_msr_bitmap_write(msr_bitmap, msr); 3966 } 3967 3968 static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) 3969 { 3970 unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; 3971 unsigned long read_intercept; 3972 int msr; 3973 3974 read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; 3975 3976 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 3977 unsigned int read_idx = msr / BITS_PER_LONG; 3978 unsigned int write_idx = read_idx + (0x800 / sizeof(long)); 3979 3980 msr_bitmap[read_idx] = read_intercept; 3981 msr_bitmap[write_idx] = ~0ul; 3982 } 3983 } 3984 3985 static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) 3986 { 3987 struct vcpu_vmx *vmx = to_vmx(vcpu); 3988 u8 mode; 3989 3990 if (!cpu_has_vmx_msr_bitmap()) 3991 return; 3992 3993 if (cpu_has_secondary_exec_ctrls() && 3994 (secondary_exec_controls_get(vmx) & 3995 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { 3996 mode = MSR_BITMAP_MODE_X2APIC; 3997 if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) 3998 mode |= MSR_BITMAP_MODE_X2APIC_APICV; 3999 } else { 4000 mode = 0; 4001 } 4002 4003 if (mode == vmx->x2apic_msr_bitmap_mode) 4004 return; 4005 4006 vmx->x2apic_msr_bitmap_mode = mode; 4007 4008 vmx_reset_x2apic_msrs(vcpu, mode); 4009 4010 /* 4011 * TPR reads and writes can be virtualized even if virtual interrupt 4012 * delivery is not in use. 
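 * As a concrete example (assuming the usual x2APIC MSR layout where
 * APIC_TASKPRI maps to MSR 0x808), the call below drops the read/write
 * intercept for the TPR whenever the bitmap is in x2APIC mode, regardless
 * of whether APICv is active.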
4013 */ 4014 vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW, 4015 !(mode & MSR_BITMAP_MODE_X2APIC)); 4016 4017 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { 4018 vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW); 4019 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); 4020 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); 4021 if (enable_ipiv) 4022 vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW); 4023 } 4024 } 4025 4026 void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) 4027 { 4028 struct vcpu_vmx *vmx = to_vmx(vcpu); 4029 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 4030 u32 i; 4031 4032 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag); 4033 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag); 4034 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag); 4035 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag); 4036 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) { 4037 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); 4038 vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); 4039 } 4040 } 4041 4042 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) 4043 { 4044 struct vcpu_vmx *vmx = to_vmx(vcpu); 4045 void *vapic_page; 4046 u32 vppr; 4047 int rvi; 4048 4049 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || 4050 !nested_cpu_has_vid(get_vmcs12(vcpu)) || 4051 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn)) 4052 return false; 4053 4054 rvi = vmx_get_rvi(); 4055 4056 vapic_page = vmx->nested.virtual_apic_map.hva; 4057 vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); 4058 4059 return ((rvi & 0xf0) > (vppr & 0xf0)); 4060 } 4061 4062 static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) 4063 { 4064 struct vcpu_vmx *vmx = to_vmx(vcpu); 4065 u32 i; 4066 4067 /* 4068 * Redo intercept permissions for MSRs that KVM is passing through to 4069 * the guest. Disabling interception will check the new MSR filter and 4070 * ensure that KVM enables interception if usersepace wants to filter 4071 * the MSR. MSRs that KVM is already intercepting don't need to be 4072 * refreshed since KVM is going to intercept them regardless of what 4073 * userspace wants. 4074 */ 4075 for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) { 4076 u32 msr = vmx_possible_passthrough_msrs[i]; 4077 4078 if (!test_bit(i, vmx->shadow_msr_intercept.read)) 4079 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R); 4080 4081 if (!test_bit(i, vmx->shadow_msr_intercept.write)) 4082 vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W); 4083 } 4084 4085 /* PT MSRs can be passed through iff PT is exposed to the guest. */ 4086 if (vmx_pt_mode_is_host_guest()) 4087 pt_update_intercept_for_msr(vcpu); 4088 } 4089 4090 static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, 4091 int pi_vec) 4092 { 4093 #ifdef CONFIG_SMP 4094 if (vcpu->mode == IN_GUEST_MODE) { 4095 /* 4096 * The vector of the virtual has already been set in the PIR. 4097 * Send a notification event to deliver the virtual interrupt 4098 * unless the vCPU is the currently running vCPU, i.e. the 4099 * event is being sent from a fastpath VM-Exit handler, in 4100 * which case the PIR will be synced to the vIRR before 4101 * re-entering the guest. 
4102 * 4103 * When the target is not the running vCPU, the following 4104 * possibilities emerge: 4105 * 4106 * Case 1: vCPU stays in non-root mode. Sending a notification 4107 * event posts the interrupt to the vCPU. 4108 * 4109 * Case 2: vCPU exits to root mode and is still runnable. The 4110 * PIR will be synced to the vIRR before re-entering the guest. 4111 * Sending a notification event is ok as the host IRQ handler 4112 * will ignore the spurious event. 4113 * 4114 * Case 3: vCPU exits to root mode and is blocked. vcpu_block() 4115 * has already synced PIR to vIRR and never blocks the vCPU if 4116 * the vIRR is not empty. Therefore, a blocked vCPU here does 4117 * not wait for any requested interrupts in PIR, and sending a 4118 * notification event also results in a benign, spurious event. 4119 */ 4120 4121 if (vcpu != kvm_get_running_vcpu()) 4122 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); 4123 return; 4124 } 4125 #endif 4126 /* 4127 * The vCPU isn't in the guest; wake the vCPU in case it is blocking, 4128 * otherwise do nothing as KVM will grab the highest priority pending 4129 * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest(). 4130 */ 4131 kvm_vcpu_wake_up(vcpu); 4132 } 4133 4134 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4135 int vector) 4136 { 4137 struct vcpu_vmx *vmx = to_vmx(vcpu); 4138 4139 if (is_guest_mode(vcpu) && 4140 vector == vmx->nested.posted_intr_nv) { 4141 /* 4142 * If a posted intr is not recognized by hardware, 4143 * we will deliver it on the next vmentry. 4144 */ 4145 vmx->nested.pi_pending = true; 4146 kvm_make_request(KVM_REQ_EVENT, vcpu); 4147 4148 /* 4149 * This pairs with the smp_mb_*() after setting vcpu->mode in 4150 * vcpu_enter_guest() to guarantee the vCPU sees the event 4151 * request if triggering a posted interrupt "fails" because 4152 * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as 4153 * the smp_wmb() in kvm_make_request() only ensures everything 4154 * done before making the request is visible when the request 4155 * is visible; it doesn't ensure ordering between the store to 4156 * vcpu->requests and the load from vcpu->mode. 4157 */ 4158 smp_mb__after_atomic(); 4159 4160 /* The PIR and ON have been set by L1. */ 4161 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR); 4162 return 0; 4163 } 4164 return -1; 4165 } 4166 /* 4167 * Send an interrupt to the vcpu via posted interrupt. 4168 * 1. If the target vcpu is running (non-root mode), send a posted interrupt 4169 * notification to the vcpu and hardware will sync the PIR to the vIRR atomically. 4170 * 2. If the target vcpu isn't running (root mode), kick it to pick up the 4171 * interrupt from the PIR on the next vmentry. 4172 */ 4173 static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4174 { 4175 struct vcpu_vmx *vmx = to_vmx(vcpu); 4176 int r; 4177 4178 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4179 if (!r) 4180 return 0; 4181 4182 /* Note, this is called iff the local APIC is in-kernel. */ 4183 if (!vcpu->arch.apic->apicv_active) 4184 return -1; 4185 4186 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4187 return 0; 4188 4189 /* If a previous notification has sent the IPI, nothing to do.
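 * Here ON is the "outstanding notification" bit: if pi_test_and_set_on()
 * finds it already set, a notification IPI is presumably still in flight
 * and the whole PIR will be processed when it lands, so sending another
 * IPI would be redundant.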
*/ 4190 if (pi_test_and_set_on(&vmx->pi_desc)) 4191 return 0; 4192 4193 /* 4194 * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*() 4195 * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is 4196 * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a 4197 * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE. 4198 */ 4199 kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR); 4200 return 0; 4201 } 4202 4203 static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 4204 int trig_mode, int vector) 4205 { 4206 struct kvm_vcpu *vcpu = apic->vcpu; 4207 4208 if (vmx_deliver_posted_interrupt(vcpu, vector)) { 4209 kvm_lapic_set_irr(vector, apic); 4210 kvm_make_request(KVM_REQ_EVENT, vcpu); 4211 kvm_vcpu_kick(vcpu); 4212 } else { 4213 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, 4214 trig_mode, vector); 4215 } 4216 } 4217 4218 /* 4219 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4220 * will not change in the lifetime of the guest. 4221 * Note that host-state that does change is set elsewhere. E.g., host-state 4222 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 4223 */ 4224 void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4225 { 4226 u32 low32, high32; 4227 unsigned long tmpl; 4228 unsigned long cr0, cr3, cr4; 4229 4230 cr0 = read_cr0(); 4231 WARN_ON(cr0 & X86_CR0_TS); 4232 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 4233 4234 /* 4235 * Save the most likely value for this task's CR3 in the VMCS. 4236 * We can't use __get_current_cr3_fast() because we're not atomic. 4237 */ 4238 cr3 = __read_cr3(); 4239 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 4240 vmx->loaded_vmcs->host_state.cr3 = cr3; 4241 4242 /* Save the most likely value for this task's CR4 in the VMCS. */ 4243 cr4 = cr4_read_shadow(); 4244 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4245 vmx->loaded_vmcs->host_state.cr4 = cr4; 4246 4247 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4248 #ifdef CONFIG_X86_64 4249 /* 4250 * Load null selectors, so we can avoid reloading them in 4251 * vmx_prepare_switch_to_host(), in case userspace uses 4252 * the null selectors too (the expected case). 4253 */ 4254 vmcs_write16(HOST_DS_SELECTOR, 0); 4255 vmcs_write16(HOST_ES_SELECTOR, 0); 4256 #else 4257 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4258 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4259 #endif 4260 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4261 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4262 4263 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ 4264 4265 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ 4266 4267 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4268 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4269 4270 /* 4271 * SYSENTER is used for 32-bit system calls on either 32-bit or 4272 * 64-bit kernels. It is always zero If neither is allowed, otherwise 4273 * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may 4274 * have already done so!). 
4275 */ 4276 if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32)) 4277 vmcs_writel(HOST_IA32_SYSENTER_ESP, 0); 4278 4279 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); 4280 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4281 4282 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4283 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4284 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4285 } 4286 4287 if (cpu_has_load_ia32_efer()) 4288 vmcs_write64(HOST_IA32_EFER, host_efer); 4289 } 4290 4291 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4292 { 4293 struct kvm_vcpu *vcpu = &vmx->vcpu; 4294 4295 vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS & 4296 ~vcpu->arch.cr4_guest_rsvd_bits; 4297 if (!enable_ept) { 4298 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS; 4299 vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS; 4300 } 4301 if (is_guest_mode(&vmx->vcpu)) 4302 vcpu->arch.cr4_guest_owned_bits &= 4303 ~get_vmcs12(vcpu)->cr4_guest_host_mask; 4304 vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); 4305 } 4306 4307 static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4308 { 4309 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4310 4311 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 4312 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4313 4314 if (!enable_vnmi) 4315 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; 4316 4317 if (!enable_preemption_timer) 4318 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 4319 4320 return pin_based_exec_ctrl; 4321 } 4322 4323 static u32 vmx_vmentry_ctrl(void) 4324 { 4325 u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; 4326 4327 if (vmx_pt_mode_is_system()) 4328 vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | 4329 VM_ENTRY_LOAD_IA32_RTIT_CTL); 4330 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 4331 return vmentry_ctrl & 4332 ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); 4333 } 4334 4335 static u32 vmx_vmexit_ctrl(void) 4336 { 4337 u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; 4338 4339 if (vmx_pt_mode_is_system()) 4340 vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | 4341 VM_EXIT_CLEAR_IA32_RTIT_CTL); 4342 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 4343 return vmexit_ctrl & 4344 ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); 4345 } 4346 4347 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 4348 { 4349 struct vcpu_vmx *vmx = to_vmx(vcpu); 4350 4351 if (is_guest_mode(vcpu)) { 4352 vmx->nested.update_vmcs01_apicv_status = true; 4353 return; 4354 } 4355 4356 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4357 4358 if (kvm_vcpu_apicv_active(vcpu)) { 4359 secondary_exec_controls_setbit(vmx, 4360 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4361 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4362 if (enable_ipiv) 4363 tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4364 } else { 4365 secondary_exec_controls_clearbit(vmx, 4366 SECONDARY_EXEC_APIC_REGISTER_VIRT | 4367 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4368 if (enable_ipiv) 4369 tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT); 4370 } 4371 4372 vmx_update_msr_bitmap_x2apic(vcpu); 4373 } 4374 4375 static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4376 { 4377 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4378 4379 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4380 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4381 4382 if (!cpu_need_tpr_shadow(&vmx->vcpu)) { 4383 exec_control &= ~CPU_BASED_TPR_SHADOW; 4384 #ifdef CONFIG_X86_64 
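/* CR8 is only accessible in 64-bit mode; without a TPR shadow, CR8 loads and stores have to exit so that TPR updates reach the (possibly userspace) local APIC emulation. */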
4385 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4386 CPU_BASED_CR8_LOAD_EXITING; 4387 #endif 4388 } 4389 if (!enable_ept) 4390 exec_control |= CPU_BASED_CR3_STORE_EXITING | 4391 CPU_BASED_CR3_LOAD_EXITING | 4392 CPU_BASED_INVLPG_EXITING; 4393 if (kvm_mwait_in_guest(vmx->vcpu.kvm)) 4394 exec_control &= ~(CPU_BASED_MWAIT_EXITING | 4395 CPU_BASED_MONITOR_EXITING); 4396 if (kvm_hlt_in_guest(vmx->vcpu.kvm)) 4397 exec_control &= ~CPU_BASED_HLT_EXITING; 4398 return exec_control; 4399 } 4400 4401 static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx) 4402 { 4403 u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl; 4404 4405 /* 4406 * IPI virtualization relies on APICv. Disable IPI virtualization if 4407 * APICv is inhibited. 4408 */ 4409 if (!enable_ipiv || !kvm_vcpu_apicv_active(&vmx->vcpu)) 4410 exec_control &= ~TERTIARY_EXEC_IPI_VIRT; 4411 4412 return exec_control; 4413 } 4414 4415 /* 4416 * Adjust a single secondary execution control bit to intercept/allow an 4417 * instruction in the guest. This is usually done based on whether or not a 4418 * feature has been exposed to the guest in order to correctly emulate faults. 4419 */ 4420 static inline void 4421 vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, 4422 u32 control, bool enabled, bool exiting) 4423 { 4424 /* 4425 * If the control is for an opt-in feature, clear the control if the 4426 * feature is not exposed to the guest, i.e. not enabled. If the 4427 * control is opt-out, i.e. an exiting control, clear the control if 4428 * the feature _is_ exposed to the guest, i.e. exiting/interception is 4429 * disabled for the associated instruction. Note, the caller is 4430 * responsible presetting exec_control to set all supported bits. 4431 */ 4432 if (enabled == exiting) 4433 *exec_control &= ~control; 4434 4435 /* 4436 * Update the nested MSR settings so that a nested VMM can/can't set 4437 * controls for features that are/aren't exposed to the guest. 4438 */ 4439 if (nested) { 4440 if (enabled) 4441 vmx->nested.msrs.secondary_ctls_high |= control; 4442 else 4443 vmx->nested.msrs.secondary_ctls_high &= ~control; 4444 } 4445 } 4446 4447 /* 4448 * Wrapper macro for the common case of adjusting a secondary execution control 4449 * based on a single guest CPUID bit, with a dedicated feature bit. This also 4450 * verifies that the control is actually supported by KVM and hardware. 4451 */ 4452 #define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \ 4453 ({ \ 4454 bool __enabled; \ 4455 \ 4456 if (cpu_has_vmx_##name()) { \ 4457 __enabled = guest_cpuid_has(&(vmx)->vcpu, \ 4458 X86_FEATURE_##feat_name); \ 4459 vmx_adjust_secondary_exec_control(vmx, exec_control, \ 4460 SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \ 4461 } \ 4462 }) 4463 4464 /* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. 
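 * For example, vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND)
 * expands to vmx_adjust_sec_exec_control(vmx, &exec_control, rdrand, RDRAND,
 * RDRAND_EXITING, true), i.e. it keys off cpu_has_vmx_rdrand(),
 * X86_FEATURE_RDRAND and SECONDARY_EXEC_RDRAND_EXITING.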
*/ 4465 #define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \ 4466 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false) 4467 4468 #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ 4469 vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) 4470 4471 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4472 { 4473 struct kvm_vcpu *vcpu = &vmx->vcpu; 4474 4475 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4476 4477 if (vmx_pt_mode_is_system()) 4478 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); 4479 if (!cpu_need_virtualize_apic_accesses(vcpu)) 4480 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4481 if (vmx->vpid == 0) 4482 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4483 if (!enable_ept) { 4484 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4485 enable_unrestricted_guest = 0; 4486 } 4487 if (!enable_unrestricted_guest) 4488 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4489 if (kvm_pause_in_guest(vmx->vcpu.kvm)) 4490 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4491 if (!kvm_vcpu_apicv_active(vcpu)) 4492 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4493 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4494 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4495 4496 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, 4497 * in vmx_set_cr4. */ 4498 exec_control &= ~SECONDARY_EXEC_DESC; 4499 4500 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4501 (handle_vmptrld). 4502 We can NOT enable shadow_vmcs here because we don't yet have 4503 a current VMCS12. 4504 */ 4505 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4506 4507 /* 4508 * PML is enabled/disabled when dirty logging of memslots changes, but 4509 * it needs to be set here when dirty logging is already active, e.g. 4510 * if this vCPU was created after dirty logging was enabled. 4511 */ 4512 if (!vcpu->kvm->arch.cpu_dirty_logging_count) 4513 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4514 4515 if (cpu_has_vmx_xsaves()) { 4516 /* Exposing XSAVES only when XSAVE is exposed */ 4517 bool xsaves_enabled = 4518 boot_cpu_has(X86_FEATURE_XSAVE) && 4519 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && 4520 guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); 4521 4522 vcpu->arch.xsaves_enabled = xsaves_enabled; 4523 4524 vmx_adjust_secondary_exec_control(vmx, &exec_control, 4525 SECONDARY_EXEC_XSAVES, 4526 xsaves_enabled, false); 4527 } 4528 4529 /* 4530 * RDPID is also gated by ENABLE_RDTSCP; turn on the control if either 4531 * feature is exposed to the guest. This creates a virtualization hole 4532 * if both are supported in hardware but only one is exposed to the 4533 * guest, but letting the guest execute RDTSCP or RDPID when either one 4534 * is advertised is preferable to emulating the advertised instruction 4535 * in KVM on #UD, and obviously better than incorrectly injecting #UD.
4536 */ 4537 if (cpu_has_vmx_rdtscp()) { 4538 bool rdpid_or_rdtscp_enabled = 4539 guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) || 4540 guest_cpuid_has(vcpu, X86_FEATURE_RDPID); 4541 4542 vmx_adjust_secondary_exec_control(vmx, &exec_control, 4543 SECONDARY_EXEC_ENABLE_RDTSCP, 4544 rdpid_or_rdtscp_enabled, false); 4545 } 4546 vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID); 4547 4548 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND); 4549 vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED); 4550 4551 vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG, 4552 ENABLE_USR_WAIT_PAUSE, false); 4553 4554 if (!vcpu->kvm->arch.bus_lock_detection_enabled) 4555 exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; 4556 4557 if (!kvm_notify_vmexit_enabled(vcpu->kvm)) 4558 exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING; 4559 4560 return exec_control; 4561 } 4562 4563 static inline int vmx_get_pid_table_order(struct kvm *kvm) 4564 { 4565 return get_order(kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table)); 4566 } 4567 4568 static int vmx_alloc_ipiv_pid_table(struct kvm *kvm) 4569 { 4570 struct page *pages; 4571 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4572 4573 if (!irqchip_in_kernel(kvm) || !enable_ipiv) 4574 return 0; 4575 4576 if (kvm_vmx->pid_table) 4577 return 0; 4578 4579 pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, vmx_get_pid_table_order(kvm)); 4580 if (!pages) 4581 return -ENOMEM; 4582 4583 kvm_vmx->pid_table = (void *)page_address(pages); 4584 return 0; 4585 } 4586 4587 static int vmx_vcpu_precreate(struct kvm *kvm) 4588 { 4589 return vmx_alloc_ipiv_pid_table(kvm); 4590 } 4591 4592 #define VMX_XSS_EXIT_BITMAP 0 4593 4594 static void init_vmcs(struct vcpu_vmx *vmx) 4595 { 4596 struct kvm *kvm = vmx->vcpu.kvm; 4597 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 4598 4599 if (nested) 4600 nested_vmx_set_vmcs_shadowing_bitmap(); 4601 4602 if (cpu_has_vmx_msr_bitmap()) 4603 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); 4604 4605 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); /* 22.3.1.5 */ 4606 4607 /* Control */ 4608 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4609 4610 exec_controls_set(vmx, vmx_exec_control(vmx)); 4611 4612 if (cpu_has_secondary_exec_ctrls()) 4613 secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); 4614 4615 if (cpu_has_tertiary_exec_ctrls()) 4616 tertiary_exec_controls_set(vmx, vmx_tertiary_exec_control(vmx)); 4617 4618 if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) { 4619 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4620 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4621 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4622 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4623 4624 vmcs_write16(GUEST_INTR_STATUS, 0); 4625 4626 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4627 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); 4628 } 4629 4630 if (vmx_can_use_ipiv(&vmx->vcpu)) { 4631 vmcs_write64(PID_POINTER_TABLE, __pa(kvm_vmx->pid_table)); 4632 vmcs_write16(LAST_PID_POINTER_INDEX, kvm->arch.max_vcpu_ids - 1); 4633 } 4634 4635 if (!kvm_pause_in_guest(kvm)) { 4636 vmcs_write32(PLE_GAP, ple_gap); 4637 vmx->ple_window = ple_window; 4638 vmx->ple_window_dirty = true; 4639 } 4640 4641 if (kvm_notify_vmexit_enabled(kvm)) 4642 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 4643 4644 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4645 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4646 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4647 4648 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4649 
vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4650 vmx_set_constant_host_state(vmx); 4651 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4652 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4653 4654 if (cpu_has_vmx_vmfunc()) 4655 vmcs_write64(VM_FUNCTION_CONTROL, 0); 4656 4657 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4658 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 4659 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 4660 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4661 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 4662 4663 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4664 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4665 4666 vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); 4667 4668 /* 22.2.1, 20.8.1 */ 4669 vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); 4670 4671 vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS; 4672 vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits); 4673 4674 set_cr4_guest_host_mask(vmx); 4675 4676 if (vmx->vpid != 0) 4677 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4678 4679 if (cpu_has_vmx_xsaves()) 4680 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4681 4682 if (enable_pml) { 4683 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 4684 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 4685 } 4686 4687 vmx_write_encls_bitmap(&vmx->vcpu, NULL); 4688 4689 if (vmx_pt_mode_is_host_guest()) { 4690 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); 4691 /* Bit[6~0] are forced to 1, writes are ignored. */ 4692 vmx->pt_desc.guest.output_mask = 0x7F; 4693 vmcs_write64(GUEST_IA32_RTIT_CTL, 0); 4694 } 4695 4696 vmcs_write32(GUEST_SYSENTER_CS, 0); 4697 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4698 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4699 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4700 4701 if (cpu_has_vmx_tpr_shadow()) { 4702 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4703 if (cpu_need_tpr_shadow(&vmx->vcpu)) 4704 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4705 __pa(vmx->vcpu.arch.apic->regs)); 4706 vmcs_write32(TPR_THRESHOLD, 0); 4707 } 4708 4709 vmx_setup_uret_msrs(vmx); 4710 } 4711 4712 static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4713 { 4714 struct vcpu_vmx *vmx = to_vmx(vcpu); 4715 4716 init_vmcs(vmx); 4717 4718 if (nested) 4719 memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs)); 4720 4721 vcpu_setup_sgx_lepubkeyhash(vcpu); 4722 4723 vmx->nested.posted_intr_nv = -1; 4724 vmx->nested.vmxon_ptr = INVALID_GPA; 4725 vmx->nested.current_vmptr = INVALID_GPA; 4726 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 4727 4728 vcpu->arch.microcode_version = 0x100000000ULL; 4729 vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED; 4730 4731 /* 4732 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR 4733 * or POSTED_INTR_WAKEUP_VECTOR. 
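 * The notification vector is switched to POSTED_INTR_WAKEUP_VECTOR only
 * while the vCPU is blocking, so a stray notification always hits a handler
 * that understands this descriptor. SN starts out set, which should keep
 * notifications suppressed until the vCPU is first loaded and the
 * descriptor is fully initialized.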
4734 */ 4735 vmx->pi_desc.nv = POSTED_INTR_VECTOR; 4736 vmx->pi_desc.sn = 1; 4737 } 4738 4739 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 4740 { 4741 struct vcpu_vmx *vmx = to_vmx(vcpu); 4742 4743 if (!init_event) 4744 __vmx_vcpu_reset(vcpu); 4745 4746 vmx->rmode.vm86_active = 0; 4747 vmx->spec_ctrl = 0; 4748 4749 vmx->msr_ia32_umwait_control = 0; 4750 4751 vmx->hv_deadline_tsc = -1; 4752 kvm_set_cr8(vcpu, 0); 4753 4754 vmx_segment_cache_clear(vmx); 4755 kvm_register_mark_available(vcpu, VCPU_EXREG_SEGMENTS); 4756 4757 seg_setup(VCPU_SREG_CS); 4758 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4759 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); 4760 4761 seg_setup(VCPU_SREG_DS); 4762 seg_setup(VCPU_SREG_ES); 4763 seg_setup(VCPU_SREG_FS); 4764 seg_setup(VCPU_SREG_GS); 4765 seg_setup(VCPU_SREG_SS); 4766 4767 vmcs_write16(GUEST_TR_SELECTOR, 0); 4768 vmcs_writel(GUEST_TR_BASE, 0); 4769 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 4770 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4771 4772 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 4773 vmcs_writel(GUEST_LDTR_BASE, 0); 4774 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4775 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4776 4777 vmcs_writel(GUEST_GDTR_BASE, 0); 4778 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4779 4780 vmcs_writel(GUEST_IDTR_BASE, 0); 4781 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4782 4783 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4784 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4785 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4786 if (kvm_mpx_supported()) 4787 vmcs_write64(GUEST_BNDCFGS, 0); 4788 4789 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4790 4791 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4792 4793 vpid_sync_context(vmx->vpid); 4794 4795 vmx_update_fb_clear_dis(vcpu, vmx); 4796 } 4797 4798 static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) 4799 { 4800 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 4801 } 4802 4803 static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu) 4804 { 4805 if (!enable_vnmi || 4806 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 4807 vmx_enable_irq_window(vcpu); 4808 return; 4809 } 4810 4811 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 4812 } 4813 4814 static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) 4815 { 4816 struct vcpu_vmx *vmx = to_vmx(vcpu); 4817 uint32_t intr; 4818 int irq = vcpu->arch.interrupt.nr; 4819 4820 trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected); 4821 4822 ++vcpu->stat.irq_injections; 4823 if (vmx->rmode.vm86_active) { 4824 int inc_eip = 0; 4825 if (vcpu->arch.interrupt.soft) 4826 inc_eip = vcpu->arch.event_exit_inst_len; 4827 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); 4828 return; 4829 } 4830 intr = irq | INTR_INFO_VALID_MASK; 4831 if (vcpu->arch.interrupt.soft) { 4832 intr |= INTR_TYPE_SOFT_INTR; 4833 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 4834 vmx->vcpu.arch.event_exit_inst_len); 4835 } else 4836 intr |= INTR_TYPE_EXT_INTR; 4837 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 4838 4839 vmx_clear_hlt(vcpu); 4840 } 4841 4842 static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 4843 { 4844 struct vcpu_vmx *vmx = to_vmx(vcpu); 4845 4846 if (!enable_vnmi) { 4847 /* 4848 * Tracking the NMI-blocked state in software is built upon 4849 * finding the next open IRQ window. This, in turn, depends on 4850 * well-behaving guests: They have to keep IRQs disabled at 4851 * least as long as the NMI handler runs. 
Otherwise we may 4852 * cause NMI nesting, maybe breaking the guest. But as this is 4853 * highly unlikely, we can live with the residual risk. 4854 */ 4855 vmx->loaded_vmcs->soft_vnmi_blocked = 1; 4856 vmx->loaded_vmcs->vnmi_blocked_time = 0; 4857 } 4858 4859 ++vcpu->stat.nmi_injections; 4860 vmx->loaded_vmcs->nmi_known_unmasked = false; 4861 4862 if (vmx->rmode.vm86_active) { 4863 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); 4864 return; 4865 } 4866 4867 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 4868 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4869 4870 vmx_clear_hlt(vcpu); 4871 } 4872 4873 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 4874 { 4875 struct vcpu_vmx *vmx = to_vmx(vcpu); 4876 bool masked; 4877 4878 if (!enable_vnmi) 4879 return vmx->loaded_vmcs->soft_vnmi_blocked; 4880 if (vmx->loaded_vmcs->nmi_known_unmasked) 4881 return false; 4882 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 4883 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 4884 return masked; 4885 } 4886 4887 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 4888 { 4889 struct vcpu_vmx *vmx = to_vmx(vcpu); 4890 4891 if (!enable_vnmi) { 4892 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { 4893 vmx->loaded_vmcs->soft_vnmi_blocked = masked; 4894 vmx->loaded_vmcs->vnmi_blocked_time = 0; 4895 } 4896 } else { 4897 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 4898 if (masked) 4899 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 4900 GUEST_INTR_STATE_NMI); 4901 else 4902 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 4903 GUEST_INTR_STATE_NMI); 4904 } 4905 } 4906 4907 bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) 4908 { 4909 if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 4910 return false; 4911 4912 if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) 4913 return true; 4914 4915 return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4916 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | 4917 GUEST_INTR_STATE_NMI)); 4918 } 4919 4920 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 4921 { 4922 if (to_vmx(vcpu)->nested.nested_run_pending) 4923 return -EBUSY; 4924 4925 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 4926 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) 4927 return -EBUSY; 4928 4929 return !vmx_nmi_blocked(vcpu); 4930 } 4931 4932 bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) 4933 { 4934 if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 4935 return false; 4936 4937 return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) || 4938 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4939 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 4940 } 4941 4942 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 4943 { 4944 if (to_vmx(vcpu)->nested.nested_run_pending) 4945 return -EBUSY; 4946 4947 /* 4948 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 4949 * e.g. if the IRQ arrived asynchronously after checking nested events. 
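 * The -EBUSY return (as opposed to plain "blocked") signals the common x86
 * code to retry the injection after the nested VM-Exit has been delivered.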
4950 */ 4951 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) 4952 return -EBUSY; 4953 4954 return !vmx_interrupt_blocked(vcpu); 4955 } 4956 4957 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 4958 { 4959 void __user *ret; 4960 4961 if (enable_unrestricted_guest) 4962 return 0; 4963 4964 mutex_lock(&kvm->slots_lock); 4965 ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, 4966 PAGE_SIZE * 3); 4967 mutex_unlock(&kvm->slots_lock); 4968 4969 if (IS_ERR(ret)) 4970 return PTR_ERR(ret); 4971 4972 to_kvm_vmx(kvm)->tss_addr = addr; 4973 4974 return init_rmode_tss(kvm, ret); 4975 } 4976 4977 static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) 4978 { 4979 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; 4980 return 0; 4981 } 4982 4983 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 4984 { 4985 switch (vec) { 4986 case BP_VECTOR: 4987 /* 4988 * Update instruction length as we may reinject the exception 4989 * from user space while in guest debugging mode. 4990 */ 4991 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 4992 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4993 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 4994 return false; 4995 fallthrough; 4996 case DB_VECTOR: 4997 return !(vcpu->guest_debug & 4998 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)); 4999 case DE_VECTOR: 5000 case OF_VECTOR: 5001 case BR_VECTOR: 5002 case UD_VECTOR: 5003 case DF_VECTOR: 5004 case SS_VECTOR: 5005 case GP_VECTOR: 5006 case MF_VECTOR: 5007 return true; 5008 } 5009 return false; 5010 } 5011 5012 static int handle_rmode_exception(struct kvm_vcpu *vcpu, 5013 int vec, u32 err_code) 5014 { 5015 /* 5016 * Instruction with address size override prefix opcode 0x67 5017 * Cause the #SS fault with 0 error code in VM86 mode. 5018 */ 5019 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 5020 if (kvm_emulate_instruction(vcpu, 0)) { 5021 if (vcpu->arch.halt_request) { 5022 vcpu->arch.halt_request = 0; 5023 return kvm_emulate_halt_noskip(vcpu); 5024 } 5025 return 1; 5026 } 5027 return 0; 5028 } 5029 5030 /* 5031 * Forward all other exceptions that are valid in real mode. 5032 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 5033 * the required debugging infrastructure rework. 5034 */ 5035 kvm_queue_exception(vcpu, vec); 5036 return 1; 5037 } 5038 5039 static int handle_machine_check(struct kvm_vcpu *vcpu) 5040 { 5041 /* handled by vmx_vcpu_run() */ 5042 return 1; 5043 } 5044 5045 /* 5046 * If the host has split lock detection disabled, then #AC is 5047 * unconditionally injected into the guest, which is the pre split lock 5048 * detection behaviour. 
5049 * 5050 * If the host has split lock detection enabled then #AC is 5051 * only injected into the guest when: 5052 * - Guest CPL == 3 (user mode) 5053 * - Guest has #AC detection enabled in CR0 5054 * - Guest EFLAGS has AC bit set 5055 */ 5056 bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu) 5057 { 5058 if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) 5059 return true; 5060 5061 return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) && 5062 (kvm_get_rflags(vcpu) & X86_EFLAGS_AC); 5063 } 5064 5065 static int handle_exception_nmi(struct kvm_vcpu *vcpu) 5066 { 5067 struct vcpu_vmx *vmx = to_vmx(vcpu); 5068 struct kvm_run *kvm_run = vcpu->run; 5069 u32 intr_info, ex_no, error_code; 5070 unsigned long cr2, dr6; 5071 u32 vect_info; 5072 5073 vect_info = vmx->idt_vectoring_info; 5074 intr_info = vmx_get_intr_info(vcpu); 5075 5076 if (is_machine_check(intr_info) || is_nmi(intr_info)) 5077 return 1; /* handled by handle_exception_nmi_irqoff() */ 5078 5079 /* 5080 * Queue the exception here instead of in handle_nm_fault_irqoff(). 5081 * This ensures the nested_vmx check is not skipped so vmexit can 5082 * be reflected to L1 (when it intercepts #NM) before reaching this 5083 * point. 5084 */ 5085 if (is_nm_fault(intr_info)) { 5086 kvm_queue_exception(vcpu, NM_VECTOR); 5087 return 1; 5088 } 5089 5090 if (is_invalid_opcode(intr_info)) 5091 return handle_ud(vcpu); 5092 5093 error_code = 0; 5094 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5095 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5096 5097 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { 5098 WARN_ON_ONCE(!enable_vmware_backdoor); 5099 5100 /* 5101 * VMware backdoor emulation on #GP interception only handles 5102 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero 5103 * error code on #GP. 5104 */ 5105 if (error_code) { 5106 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 5107 return 1; 5108 } 5109 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); 5110 } 5111 5112 /* 5113 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5114 * MMIO, it is better to report an internal error. 5115 * See the comments in vmx_handle_exit. 5116 */ 5117 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5118 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5119 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5120 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5121 vcpu->run->internal.ndata = 4; 5122 vcpu->run->internal.data[0] = vect_info; 5123 vcpu->run->internal.data[1] = intr_info; 5124 vcpu->run->internal.data[2] = error_code; 5125 vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; 5126 return 0; 5127 } 5128 5129 if (is_page_fault(intr_info)) { 5130 cr2 = vmx_get_exit_qual(vcpu); 5131 if (enable_ept && !vcpu->arch.apf.host_apf_flags) { 5132 /* 5133 * EPT will cause page fault only if we need to 5134 * detect illegal GPAs. 5135 */ 5136 WARN_ON_ONCE(!allow_smaller_maxphyaddr); 5137 kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); 5138 return 1; 5139 } else 5140 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 5141 } 5142 5143 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5144 5145 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5146 return handle_rmode_exception(vcpu, ex_no, error_code); 5147 5148 switch (ex_no) { 5149 case DB_VECTOR: 5150 dr6 = vmx_get_exit_qual(vcpu); 5151 if (!(vcpu->guest_debug & 5152 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5153 /* 5154 * If the #DB was due to ICEBP, a.k.a. 
INT1, skip the 5155 * instruction. ICEBP generates a trap-like #DB, but 5156 * despite its interception control being tied to #DB, 5157 * is an instruction intercept, i.e. the VM-Exit occurs 5158 * on the ICEBP itself. Note, skipping ICEBP also 5159 * clears STI and MOVSS blocking. 5160 * 5161 * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS 5162 * if single-step is enabled in RFLAGS and STI or MOVSS 5163 * blocking is active, as the CPU doesn't set the bit 5164 * on VM-Exit due to #DB interception. VM-Entry has a 5165 * consistency check that a single-step #DB is pending 5166 * in this scenario as the previous instruction cannot 5167 * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV 5168 * don't modify RFLAGS), therefore the one instruction 5169 * delay when activating single-step breakpoints must 5170 * have already expired. Note, the CPU sets/clears BS 5171 * as appropriate for all other VM-Exits types. 5172 */ 5173 if (is_icebp(intr_info)) 5174 WARN_ON(!skip_emulated_instruction(vcpu)); 5175 else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) && 5176 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 5177 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS))) 5178 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 5179 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS); 5180 5181 kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); 5182 return 1; 5183 } 5184 kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; 5185 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5186 fallthrough; 5187 case BP_VECTOR: 5188 /* 5189 * Update instruction length as we may reinject #BP from 5190 * user space while in guest debugging mode. Reading it for 5191 * #DB as well causes no harm, it is not used in that case. 5192 */ 5193 vmx->vcpu.arch.event_exit_inst_len = 5194 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5195 kvm_run->exit_reason = KVM_EXIT_DEBUG; 5196 kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5197 kvm_run->debug.arch.exception = ex_no; 5198 break; 5199 case AC_VECTOR: 5200 if (vmx_guest_inject_ac(vcpu)) { 5201 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 5202 return 1; 5203 } 5204 5205 /* 5206 * Handle split lock. Depending on detection mode this will 5207 * either warn and disable split lock detection for this 5208 * task or force SIGBUS on it. 
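 * A non-zero return from handle_guest_split_lock() means the #AC was
 * handled on the host side and the guest is simply resumed; otherwise the
 * code falls through and reports the exception to userspace.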
5209 */ 5210 if (handle_guest_split_lock(kvm_rip_read(vcpu))) 5211 return 1; 5212 fallthrough; 5213 default: 5214 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 5215 kvm_run->ex.exception = ex_no; 5216 kvm_run->ex.error_code = error_code; 5217 break; 5218 } 5219 return 0; 5220 } 5221 5222 static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) 5223 { 5224 ++vcpu->stat.irq_exits; 5225 return 1; 5226 } 5227 5228 static int handle_triple_fault(struct kvm_vcpu *vcpu) 5229 { 5230 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5231 vcpu->mmio_needed = 0; 5232 return 0; 5233 } 5234 5235 static int handle_io(struct kvm_vcpu *vcpu) 5236 { 5237 unsigned long exit_qualification; 5238 int size, in, string; 5239 unsigned port; 5240 5241 exit_qualification = vmx_get_exit_qual(vcpu); 5242 string = (exit_qualification & 16) != 0; 5243 5244 ++vcpu->stat.io_exits; 5245 5246 if (string) 5247 return kvm_emulate_instruction(vcpu, 0); 5248 5249 port = exit_qualification >> 16; 5250 size = (exit_qualification & 7) + 1; 5251 in = (exit_qualification & 8) != 0; 5252 5253 return kvm_fast_pio(vcpu, size, port, in); 5254 } 5255 5256 static void 5257 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5258 { 5259 /* 5260 * Patch in the VMCALL instruction: 5261 */ 5262 hypercall[0] = 0x0f; 5263 hypercall[1] = 0x01; 5264 hypercall[2] = 0xc1; 5265 } 5266 5267 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5268 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5269 { 5270 if (is_guest_mode(vcpu)) { 5271 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5272 unsigned long orig_val = val; 5273 5274 /* 5275 * We get here when L2 changed cr0 in a way that did not change 5276 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5277 * but did change L0 shadowed bits. So we first calculate the 5278 * effective cr0 value that L1 would like to write into the 5279 * hardware. It consists of the L2-owned bits from the new 5280 * value combined with the L1-owned bits from L1's guest_cr0. 
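 *
 * For instance, if L1 owns only CR0.TS (cr0_guest_host_mask == X86_CR0_TS),
 * the value passed to kvm_set_cr0() below keeps every bit of L2's new value
 * except TS, which is taken from vmcs12->guest_cr0 instead.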
5281 */ 5282 val = (val & ~vmcs12->cr0_guest_host_mask) | 5283 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5284 5285 if (!nested_guest_cr0_valid(vcpu, val)) 5286 return 1; 5287 5288 if (kvm_set_cr0(vcpu, val)) 5289 return 1; 5290 vmcs_writel(CR0_READ_SHADOW, orig_val); 5291 return 0; 5292 } else { 5293 if (to_vmx(vcpu)->nested.vmxon && 5294 !nested_host_cr0_valid(vcpu, val)) 5295 return 1; 5296 5297 return kvm_set_cr0(vcpu, val); 5298 } 5299 } 5300 5301 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5302 { 5303 if (is_guest_mode(vcpu)) { 5304 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5305 unsigned long orig_val = val; 5306 5307 /* analogously to handle_set_cr0 */ 5308 val = (val & ~vmcs12->cr4_guest_host_mask) | 5309 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5310 if (kvm_set_cr4(vcpu, val)) 5311 return 1; 5312 vmcs_writel(CR4_READ_SHADOW, orig_val); 5313 return 0; 5314 } else 5315 return kvm_set_cr4(vcpu, val); 5316 } 5317 5318 static int handle_desc(struct kvm_vcpu *vcpu) 5319 { 5320 WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); 5321 return kvm_emulate_instruction(vcpu, 0); 5322 } 5323 5324 static int handle_cr(struct kvm_vcpu *vcpu) 5325 { 5326 unsigned long exit_qualification, val; 5327 int cr; 5328 int reg; 5329 int err; 5330 int ret; 5331 5332 exit_qualification = vmx_get_exit_qual(vcpu); 5333 cr = exit_qualification & 15; 5334 reg = (exit_qualification >> 8) & 15; 5335 switch ((exit_qualification >> 4) & 3) { 5336 case 0: /* mov to cr */ 5337 val = kvm_register_read(vcpu, reg); 5338 trace_kvm_cr_write(cr, val); 5339 switch (cr) { 5340 case 0: 5341 err = handle_set_cr0(vcpu, val); 5342 return kvm_complete_insn_gp(vcpu, err); 5343 case 3: 5344 WARN_ON_ONCE(enable_unrestricted_guest); 5345 5346 err = kvm_set_cr3(vcpu, val); 5347 return kvm_complete_insn_gp(vcpu, err); 5348 case 4: 5349 err = handle_set_cr4(vcpu, val); 5350 return kvm_complete_insn_gp(vcpu, err); 5351 case 8: { 5352 u8 cr8_prev = kvm_get_cr8(vcpu); 5353 u8 cr8 = (u8)val; 5354 err = kvm_set_cr8(vcpu, cr8); 5355 ret = kvm_complete_insn_gp(vcpu, err); 5356 if (lapic_in_kernel(vcpu)) 5357 return ret; 5358 if (cr8_prev <= cr8) 5359 return ret; 5360 /* 5361 * TODO: we might be squashing a 5362 * KVM_GUESTDBG_SINGLESTEP-triggered 5363 * KVM_EXIT_DEBUG here. 
5364 */ 5365 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5366 return 0; 5367 } 5368 } 5369 break; 5370 case 2: /* clts */ 5371 KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); 5372 return -EIO; 5373 case 1: /*mov from cr*/ 5374 switch (cr) { 5375 case 3: 5376 WARN_ON_ONCE(enable_unrestricted_guest); 5377 5378 val = kvm_read_cr3(vcpu); 5379 kvm_register_write(vcpu, reg, val); 5380 trace_kvm_cr_read(cr, val); 5381 return kvm_skip_emulated_instruction(vcpu); 5382 case 8: 5383 val = kvm_get_cr8(vcpu); 5384 kvm_register_write(vcpu, reg, val); 5385 trace_kvm_cr_read(cr, val); 5386 return kvm_skip_emulated_instruction(vcpu); 5387 } 5388 break; 5389 case 3: /* lmsw */ 5390 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5391 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); 5392 kvm_lmsw(vcpu, val); 5393 5394 return kvm_skip_emulated_instruction(vcpu); 5395 default: 5396 break; 5397 } 5398 vcpu->run->exit_reason = 0; 5399 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 5400 (int)(exit_qualification >> 4) & 3, cr); 5401 return 0; 5402 } 5403 5404 static int handle_dr(struct kvm_vcpu *vcpu) 5405 { 5406 unsigned long exit_qualification; 5407 int dr, dr7, reg; 5408 int err = 1; 5409 5410 exit_qualification = vmx_get_exit_qual(vcpu); 5411 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5412 5413 /* First, if DR does not exist, trigger UD */ 5414 if (!kvm_require_dr(vcpu, dr)) 5415 return 1; 5416 5417 if (vmx_get_cpl(vcpu) > 0) 5418 goto out; 5419 5420 dr7 = vmcs_readl(GUEST_DR7); 5421 if (dr7 & DR7_GD) { 5422 /* 5423 * As the vm-exit takes precedence over the debug trap, we 5424 * need to emulate the latter, either for the host or the 5425 * guest debugging itself. 5426 */ 5427 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5428 vcpu->run->debug.arch.dr6 = DR6_BD | DR6_ACTIVE_LOW; 5429 vcpu->run->debug.arch.dr7 = dr7; 5430 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5431 vcpu->run->debug.arch.exception = DB_VECTOR; 5432 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5433 return 0; 5434 } else { 5435 kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); 5436 return 1; 5437 } 5438 } 5439 5440 if (vcpu->guest_debug == 0) { 5441 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5442 5443 /* 5444 * No more DR vmexits; force a reload of the debug registers 5445 * and reenter on this instruction. The next vmexit will 5446 * retrieve the full state of the debug registers. 5447 */ 5448 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5449 return 1; 5450 } 5451 5452 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5453 if (exit_qualification & TYPE_MOV_FROM_DR) { 5454 unsigned long val; 5455 5456 kvm_get_dr(vcpu, dr, &val); 5457 kvm_register_write(vcpu, reg, val); 5458 err = 0; 5459 } else { 5460 err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg)); 5461 } 5462 5463 out: 5464 return kvm_complete_insn_gp(vcpu, err); 5465 } 5466 5467 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5468 { 5469 get_debugreg(vcpu->arch.db[0], 0); 5470 get_debugreg(vcpu->arch.db[1], 1); 5471 get_debugreg(vcpu->arch.db[2], 2); 5472 get_debugreg(vcpu->arch.db[3], 3); 5473 get_debugreg(vcpu->arch.dr6, 6); 5474 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5475 5476 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5477 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 5478 5479 /* 5480 * exc_debug expects dr6 to be cleared after it runs, avoid that it sees 5481 * a stale dr6 from the guest. 
5482 */ 5483 set_debugreg(DR6_RESERVED, 6); 5484 } 5485 5486 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5487 { 5488 vmcs_writel(GUEST_DR7, val); 5489 } 5490 5491 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5492 { 5493 kvm_apic_update_ppr(vcpu); 5494 return 1; 5495 } 5496 5497 static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5498 { 5499 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING); 5500 5501 kvm_make_request(KVM_REQ_EVENT, vcpu); 5502 5503 ++vcpu->stat.irq_window_exits; 5504 return 1; 5505 } 5506 5507 static int handle_invlpg(struct kvm_vcpu *vcpu) 5508 { 5509 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5510 5511 kvm_mmu_invlpg(vcpu, exit_qualification); 5512 return kvm_skip_emulated_instruction(vcpu); 5513 } 5514 5515 static int handle_apic_access(struct kvm_vcpu *vcpu) 5516 { 5517 if (likely(fasteoi)) { 5518 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5519 int access_type, offset; 5520 5521 access_type = exit_qualification & APIC_ACCESS_TYPE; 5522 offset = exit_qualification & APIC_ACCESS_OFFSET; 5523 /* 5524 * Sane guest uses MOV to write EOI, with written value 5525 * not cared. So make a short-circuit here by avoiding 5526 * heavy instruction emulation. 5527 */ 5528 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5529 (offset == APIC_EOI)) { 5530 kvm_lapic_set_eoi(vcpu); 5531 return kvm_skip_emulated_instruction(vcpu); 5532 } 5533 } 5534 return kvm_emulate_instruction(vcpu, 0); 5535 } 5536 5537 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5538 { 5539 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5540 int vector = exit_qualification & 0xff; 5541 5542 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5543 kvm_apic_set_eoi_accelerated(vcpu, vector); 5544 return 1; 5545 } 5546 5547 static int handle_apic_write(struct kvm_vcpu *vcpu) 5548 { 5549 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5550 5551 /* 5552 * APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and 5553 * hardware has done any necessary aliasing, offset adjustments, etc... 5554 * for the access. I.e. the correct value has already been written to 5555 * the vAPIC page for the correct 16-byte chunk. KVM needs only to 5556 * retrieve the register value and emulate the access. 
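 * The low 12 bits of the exit qualification hold the page offset of the
 * write; masking with 0xff0 below keeps the 16-byte-aligned register offset
 * that kvm_apic_write_nodecode() expects.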
5557 */ 5558 u32 offset = exit_qualification & 0xff0; 5559 5560 kvm_apic_write_nodecode(vcpu, offset); 5561 return 1; 5562 } 5563 5564 static int handle_task_switch(struct kvm_vcpu *vcpu) 5565 { 5566 struct vcpu_vmx *vmx = to_vmx(vcpu); 5567 unsigned long exit_qualification; 5568 bool has_error_code = false; 5569 u32 error_code = 0; 5570 u16 tss_selector; 5571 int reason, type, idt_v, idt_index; 5572 5573 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5574 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5575 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5576 5577 exit_qualification = vmx_get_exit_qual(vcpu); 5578 5579 reason = (u32)exit_qualification >> 30; 5580 if (reason == TASK_SWITCH_GATE && idt_v) { 5581 switch (type) { 5582 case INTR_TYPE_NMI_INTR: 5583 vcpu->arch.nmi_injected = false; 5584 vmx_set_nmi_mask(vcpu, true); 5585 break; 5586 case INTR_TYPE_EXT_INTR: 5587 case INTR_TYPE_SOFT_INTR: 5588 kvm_clear_interrupt_queue(vcpu); 5589 break; 5590 case INTR_TYPE_HARD_EXCEPTION: 5591 if (vmx->idt_vectoring_info & 5592 VECTORING_INFO_DELIVER_CODE_MASK) { 5593 has_error_code = true; 5594 error_code = 5595 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5596 } 5597 fallthrough; 5598 case INTR_TYPE_SOFT_EXCEPTION: 5599 kvm_clear_exception_queue(vcpu); 5600 break; 5601 default: 5602 break; 5603 } 5604 } 5605 tss_selector = exit_qualification; 5606 5607 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5608 type != INTR_TYPE_EXT_INTR && 5609 type != INTR_TYPE_NMI_INTR)) 5610 WARN_ON(!skip_emulated_instruction(vcpu)); 5611 5612 /* 5613 * TODO: What about debug traps on tss switch? 5614 * Are we supposed to inject them and update dr6? 5615 */ 5616 return kvm_task_switch(vcpu, tss_selector, 5617 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, 5618 reason, has_error_code, error_code); 5619 } 5620 5621 static int handle_ept_violation(struct kvm_vcpu *vcpu) 5622 { 5623 unsigned long exit_qualification; 5624 gpa_t gpa; 5625 u64 error_code; 5626 5627 exit_qualification = vmx_get_exit_qual(vcpu); 5628 5629 /* 5630 * EPT violation happened while executing iret from NMI, 5631 * "blocked by NMI" bit has to be set before next VM entry. 5632 * There are errata that may cause this bit to not be set: 5633 * AAK134, BY25. 5634 */ 5635 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5636 enable_vnmi && 5637 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5638 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5639 5640 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5641 trace_kvm_page_fault(gpa, exit_qualification); 5642 5643 /* Is it a read fault? */ 5644 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) 5645 ? PFERR_USER_MASK : 0; 5646 /* Is it a write fault? */ 5647 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) 5648 ? PFERR_WRITE_MASK : 0; 5649 /* Is it a fetch fault? */ 5650 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) 5651 ? PFERR_FETCH_MASK : 0; 5652 /* ept page table entry is present? */ 5653 error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK) 5654 ? PFERR_PRESENT_MASK : 0; 5655 5656 error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ? 5657 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; 5658 5659 vcpu->arch.exit_qualification = exit_qualification; 5660 5661 /* 5662 * Check that the GPA doesn't exceed physical memory limits, as that is 5663 * a guest page fault. 
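 * (This can only happen when allow_smaller_maxphyaddr is enabled and the
 * guest's MAXPHYADDR is smaller than the host's, see the check below.)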
We have to emulate the instruction here, because 5664 * if the illegal address is that of a paging structure, then 5665 * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we 5666 * would also use advanced VM-exit information for EPT violations to 5667 * reconstruct the page fault error code. 5668 */ 5669 if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa))) 5670 return kvm_emulate_instruction(vcpu, 0); 5671 5672 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5673 } 5674 5675 static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5676 { 5677 gpa_t gpa; 5678 5679 if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0)) 5680 return 1; 5681 5682 /* 5683 * A nested guest cannot optimize MMIO vmexits, because we have an 5684 * nGPA here instead of the required GPA. 5685 */ 5686 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5687 if (!is_guest_mode(vcpu) && 5688 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5689 trace_kvm_fast_mmio(gpa); 5690 return kvm_skip_emulated_instruction(vcpu); 5691 } 5692 5693 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); 5694 } 5695 5696 static int handle_nmi_window(struct kvm_vcpu *vcpu) 5697 { 5698 if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) 5699 return -EIO; 5700 5701 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); 5702 ++vcpu->stat.nmi_window_exits; 5703 kvm_make_request(KVM_REQ_EVENT, vcpu); 5704 5705 return 1; 5706 } 5707 5708 static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) 5709 { 5710 struct vcpu_vmx *vmx = to_vmx(vcpu); 5711 5712 return vmx->emulation_required && !vmx->rmode.vm86_active && 5713 (vcpu->arch.exception.pending || vcpu->arch.exception.injected); 5714 } 5715 5716 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5717 { 5718 struct vcpu_vmx *vmx = to_vmx(vcpu); 5719 bool intr_window_requested; 5720 unsigned count = 130; 5721 5722 intr_window_requested = exec_controls_get(vmx) & 5723 CPU_BASED_INTR_WINDOW_EXITING; 5724 5725 while (vmx->emulation_required && count-- != 0) { 5726 if (intr_window_requested && !vmx_interrupt_blocked(vcpu)) 5727 return handle_interrupt_window(&vmx->vcpu); 5728 5729 if (kvm_test_request(KVM_REQ_EVENT, vcpu)) 5730 return 1; 5731 5732 if (!kvm_emulate_instruction(vcpu, 0)) 5733 return 0; 5734 5735 if (vmx_emulation_required_with_pending_exception(vcpu)) { 5736 kvm_prepare_emulation_failure_exit(vcpu); 5737 return 0; 5738 } 5739 5740 if (vcpu->arch.halt_request) { 5741 vcpu->arch.halt_request = 0; 5742 return kvm_emulate_halt_noskip(vcpu); 5743 } 5744 5745 /* 5746 * Note, return 1 and not 0, vcpu_run() will invoke 5747 * xfer_to_guest_mode() which will create a proper return 5748 * code. 
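 * The generic entry code (xfer_to_guest_mode_handle_work()) then handles
 * pending signals, reschedules, etc. and produces the correct return
 * value for userspace.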
5749 */ 5750 if (__xfer_to_guest_mode_work_pending()) 5751 return 1; 5752 } 5753 5754 return 1; 5755 } 5756 5757 static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu) 5758 { 5759 if (vmx_emulation_required_with_pending_exception(vcpu)) { 5760 kvm_prepare_emulation_failure_exit(vcpu); 5761 return 0; 5762 } 5763 5764 return 1; 5765 } 5766 5767 static void grow_ple_window(struct kvm_vcpu *vcpu) 5768 { 5769 struct vcpu_vmx *vmx = to_vmx(vcpu); 5770 unsigned int old = vmx->ple_window; 5771 5772 vmx->ple_window = __grow_ple_window(old, ple_window, 5773 ple_window_grow, 5774 ple_window_max); 5775 5776 if (vmx->ple_window != old) { 5777 vmx->ple_window_dirty = true; 5778 trace_kvm_ple_window_update(vcpu->vcpu_id, 5779 vmx->ple_window, old); 5780 } 5781 } 5782 5783 static void shrink_ple_window(struct kvm_vcpu *vcpu) 5784 { 5785 struct vcpu_vmx *vmx = to_vmx(vcpu); 5786 unsigned int old = vmx->ple_window; 5787 5788 vmx->ple_window = __shrink_ple_window(old, ple_window, 5789 ple_window_shrink, 5790 ple_window); 5791 5792 if (vmx->ple_window != old) { 5793 vmx->ple_window_dirty = true; 5794 trace_kvm_ple_window_update(vcpu->vcpu_id, 5795 vmx->ple_window, old); 5796 } 5797 } 5798 5799 /* 5800 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 5801 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 5802 */ 5803 static int handle_pause(struct kvm_vcpu *vcpu) 5804 { 5805 if (!kvm_pause_in_guest(vcpu->kvm)) 5806 grow_ple_window(vcpu); 5807 5808 /* 5809 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" 5810 * VM-execution control is ignored if CPL > 0. OTOH, KVM 5811 * never set PAUSE_EXITING and just set PLE if supported, 5812 * so the vcpu must be CPL=0 if it gets a PAUSE exit. 5813 */ 5814 kvm_vcpu_on_spin(vcpu, true); 5815 return kvm_skip_emulated_instruction(vcpu); 5816 } 5817 5818 static int handle_monitor_trap(struct kvm_vcpu *vcpu) 5819 { 5820 return 1; 5821 } 5822 5823 static int handle_invpcid(struct kvm_vcpu *vcpu) 5824 { 5825 u32 vmx_instruction_info; 5826 unsigned long type; 5827 gva_t gva; 5828 struct { 5829 u64 pcid; 5830 u64 gla; 5831 } operand; 5832 int gpr_index; 5833 5834 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { 5835 kvm_queue_exception(vcpu, UD_VECTOR); 5836 return 1; 5837 } 5838 5839 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5840 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5841 type = kvm_register_read(vcpu, gpr_index); 5842 5843 /* According to the Intel instruction reference, the memory operand 5844 * is read even if it isn't needed (e.g., for type==all) 5845 */ 5846 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5847 vmx_instruction_info, false, 5848 sizeof(operand), &gva)) 5849 return 1; 5850 5851 return kvm_handle_invpcid(vcpu, type, gva); 5852 } 5853 5854 static int handle_pml_full(struct kvm_vcpu *vcpu) 5855 { 5856 unsigned long exit_qualification; 5857 5858 trace_kvm_pml_full(vcpu->vcpu_id); 5859 5860 exit_qualification = vmx_get_exit_qual(vcpu); 5861 5862 /* 5863 * PML buffer FULL happened while executing iret from NMI, 5864 * "blocked by NMI" bit has to be set before next VM entry. 5865 */ 5866 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5867 enable_vnmi && 5868 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5869 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5870 GUEST_INTR_STATE_NMI); 5871 5872 /* 5873 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 5874 * here.., and there's no userspace involvement needed for PML. 
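 * (vmx_flush_pml_buffer() has also reset GUEST_PML_INDEX already, so the
 * guest simply resumes logging into an empty buffer.)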
5875 */ 5876 return 1; 5877 } 5878 5879 static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu) 5880 { 5881 struct vcpu_vmx *vmx = to_vmx(vcpu); 5882 5883 if (!vmx->req_immediate_exit && 5884 !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) { 5885 kvm_lapic_expired_hv_timer(vcpu); 5886 return EXIT_FASTPATH_REENTER_GUEST; 5887 } 5888 5889 return EXIT_FASTPATH_NONE; 5890 } 5891 5892 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 5893 { 5894 handle_fastpath_preemption_timer(vcpu); 5895 return 1; 5896 } 5897 5898 /* 5899 * When nested=0, all VMX instruction VM Exits filter here. The handlers 5900 * are overwritten by nested_vmx_setup() when nested=1. 5901 */ 5902 static int handle_vmx_instruction(struct kvm_vcpu *vcpu) 5903 { 5904 kvm_queue_exception(vcpu, UD_VECTOR); 5905 return 1; 5906 } 5907 5908 #ifndef CONFIG_X86_SGX_KVM 5909 static int handle_encls(struct kvm_vcpu *vcpu) 5910 { 5911 /* 5912 * SGX virtualization is disabled. There is no software enable bit for 5913 * SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent 5914 * the guest from executing ENCLS (when SGX is supported by hardware). 5915 */ 5916 kvm_queue_exception(vcpu, UD_VECTOR); 5917 return 1; 5918 } 5919 #endif /* CONFIG_X86_SGX_KVM */ 5920 5921 static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu) 5922 { 5923 /* 5924 * Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK 5925 * VM-Exits. Unconditionally set the flag here and leave the handling to 5926 * vmx_handle_exit(). 5927 */ 5928 to_vmx(vcpu)->exit_reason.bus_lock_detected = true; 5929 return 1; 5930 } 5931 5932 static int handle_notify(struct kvm_vcpu *vcpu) 5933 { 5934 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5935 bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID; 5936 5937 ++vcpu->stat.notify_window_exits; 5938 5939 /* 5940 * Notify VM exit happened while executing iret from NMI, 5941 * "blocked by NMI" bit has to be set before next VM entry. 5942 */ 5943 if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI)) 5944 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5945 GUEST_INTR_STATE_NMI); 5946 5947 if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER || 5948 context_invalid) { 5949 vcpu->run->exit_reason = KVM_EXIT_NOTIFY; 5950 vcpu->run->notify.flags = context_invalid ? 5951 KVM_NOTIFY_CONTEXT_INVALID : 0; 5952 return 0; 5953 } 5954 5955 return 1; 5956 } 5957 5958 /* 5959 * The exit handlers return 1 if the exit was handled fully and guest execution 5960 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 5961 * to be done to userspace and return 0. 
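 * A negative return value (e.g. -EIO after a KVM_BUG_ON()) indicates an
 * internal KVM error and terminates the run loop.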
5962 */ 5963 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 5964 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, 5965 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 5966 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 5967 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 5968 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 5969 [EXIT_REASON_CR_ACCESS] = handle_cr, 5970 [EXIT_REASON_DR_ACCESS] = handle_dr, 5971 [EXIT_REASON_CPUID] = kvm_emulate_cpuid, 5972 [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, 5973 [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, 5974 [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window, 5975 [EXIT_REASON_HLT] = kvm_emulate_halt, 5976 [EXIT_REASON_INVD] = kvm_emulate_invd, 5977 [EXIT_REASON_INVLPG] = handle_invlpg, 5978 [EXIT_REASON_RDPMC] = kvm_emulate_rdpmc, 5979 [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, 5980 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, 5981 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, 5982 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, 5983 [EXIT_REASON_VMPTRST] = handle_vmx_instruction, 5984 [EXIT_REASON_VMREAD] = handle_vmx_instruction, 5985 [EXIT_REASON_VMRESUME] = handle_vmx_instruction, 5986 [EXIT_REASON_VMWRITE] = handle_vmx_instruction, 5987 [EXIT_REASON_VMOFF] = handle_vmx_instruction, 5988 [EXIT_REASON_VMON] = handle_vmx_instruction, 5989 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 5990 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 5991 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 5992 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 5993 [EXIT_REASON_WBINVD] = kvm_emulate_wbinvd, 5994 [EXIT_REASON_XSETBV] = kvm_emulate_xsetbv, 5995 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 5996 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 5997 [EXIT_REASON_GDTR_IDTR] = handle_desc, 5998 [EXIT_REASON_LDTR_TR] = handle_desc, 5999 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 6000 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 6001 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 6002 [EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait, 6003 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 6004 [EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor, 6005 [EXIT_REASON_INVEPT] = handle_vmx_instruction, 6006 [EXIT_REASON_INVVPID] = handle_vmx_instruction, 6007 [EXIT_REASON_RDRAND] = kvm_handle_invalid_op, 6008 [EXIT_REASON_RDSEED] = kvm_handle_invalid_op, 6009 [EXIT_REASON_PML_FULL] = handle_pml_full, 6010 [EXIT_REASON_INVPCID] = handle_invpcid, 6011 [EXIT_REASON_VMFUNC] = handle_vmx_instruction, 6012 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 6013 [EXIT_REASON_ENCLS] = handle_encls, 6014 [EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit, 6015 [EXIT_REASON_NOTIFY] = handle_notify, 6016 }; 6017 6018 static const int kvm_vmx_max_exit_handlers = 6019 ARRAY_SIZE(kvm_vmx_exit_handlers); 6020 6021 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 6022 u64 *info1, u64 *info2, 6023 u32 *intr_info, u32 *error_code) 6024 { 6025 struct vcpu_vmx *vmx = to_vmx(vcpu); 6026 6027 *reason = vmx->exit_reason.full; 6028 *info1 = vmx_get_exit_qual(vcpu); 6029 if (!(vmx->exit_reason.failed_vmentry)) { 6030 *info2 = vmx->idt_vectoring_info; 6031 *intr_info = vmx_get_intr_info(vcpu); 6032 if (is_exception_with_error_code(*intr_info)) 6033 *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6034 else 6035 *error_code = 0; 6036 } else { 6037 *info2 = 0; 6038 *intr_info = 0; 6039 *error_code = 0; 6040 } 6041 } 6042 6043 static void 
vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 6044 { 6045 if (vmx->pml_pg) { 6046 __free_page(vmx->pml_pg); 6047 vmx->pml_pg = NULL; 6048 } 6049 } 6050 6051 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 6052 { 6053 struct vcpu_vmx *vmx = to_vmx(vcpu); 6054 u64 *pml_buf; 6055 u16 pml_idx; 6056 6057 pml_idx = vmcs_read16(GUEST_PML_INDEX); 6058 6059 /* Do nothing if PML buffer is empty */ 6060 if (pml_idx == (PML_ENTITY_NUM - 1)) 6061 return; 6062 6063 /* PML index always points to next available PML buffer entity */ 6064 if (pml_idx >= PML_ENTITY_NUM) 6065 pml_idx = 0; 6066 else 6067 pml_idx++; 6068 6069 pml_buf = page_address(vmx->pml_pg); 6070 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { 6071 u64 gpa; 6072 6073 gpa = pml_buf[pml_idx]; 6074 WARN_ON(gpa & (PAGE_SIZE - 1)); 6075 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 6076 } 6077 6078 /* reset PML index */ 6079 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 6080 } 6081 6082 static void vmx_dump_sel(char *name, uint32_t sel) 6083 { 6084 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 6085 name, vmcs_read16(sel), 6086 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 6087 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 6088 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 6089 } 6090 6091 static void vmx_dump_dtsel(char *name, uint32_t limit) 6092 { 6093 pr_err("%s limit=0x%08x, base=0x%016lx\n", 6094 name, vmcs_read32(limit), 6095 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 6096 } 6097 6098 static void vmx_dump_msrs(char *name, struct vmx_msrs *m) 6099 { 6100 unsigned int i; 6101 struct vmx_msr_entry *e; 6102 6103 pr_err("MSR %s:\n", name); 6104 for (i = 0, e = m->val; i < m->nr; ++i, ++e) 6105 pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value); 6106 } 6107 6108 void dump_vmcs(struct kvm_vcpu *vcpu) 6109 { 6110 struct vcpu_vmx *vmx = to_vmx(vcpu); 6111 u32 vmentry_ctl, vmexit_ctl; 6112 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; 6113 u64 tertiary_exec_control; 6114 unsigned long cr4; 6115 int efer_slot; 6116 6117 if (!dump_invalid_vmcs) { 6118 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); 6119 return; 6120 } 6121 6122 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); 6123 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); 6124 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 6125 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); 6126 cr4 = vmcs_readl(GUEST_CR4); 6127 6128 if (cpu_has_secondary_exec_ctrls()) 6129 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6130 else 6131 secondary_exec_control = 0; 6132 6133 if (cpu_has_tertiary_exec_ctrls()) 6134 tertiary_exec_control = vmcs_read64(TERTIARY_VM_EXEC_CONTROL); 6135 else 6136 tertiary_exec_control = 0; 6137 6138 pr_err("VMCS %p, last attempted VM-entry on CPU %d\n", 6139 vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu); 6140 pr_err("*** Guest State ***\n"); 6141 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6142 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), 6143 vmcs_readl(CR0_GUEST_HOST_MASK)); 6144 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 6145 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); 6146 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); 6147 if (cpu_has_vmx_ept()) { 6148 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", 6149 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); 6150 pr_err("PDPTR2 = 0x%016llx PDPTR3 
= 0x%016llx\n", 6151 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); 6152 } 6153 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", 6154 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); 6155 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", 6156 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); 6157 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6158 vmcs_readl(GUEST_SYSENTER_ESP), 6159 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); 6160 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); 6161 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); 6162 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); 6163 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); 6164 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); 6165 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); 6166 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); 6167 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); 6168 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); 6169 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); 6170 efer_slot = vmx_find_loadstore_msr_slot(&vmx->msr_autoload.guest, MSR_EFER); 6171 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER) 6172 pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER)); 6173 else if (efer_slot >= 0) 6174 pr_err("EFER= 0x%016llx (autoload)\n", 6175 vmx->msr_autoload.guest.val[efer_slot].value); 6176 else if (vmentry_ctl & VM_ENTRY_IA32E_MODE) 6177 pr_err("EFER= 0x%016llx (effective)\n", 6178 vcpu->arch.efer | (EFER_LMA | EFER_LME)); 6179 else 6180 pr_err("EFER= 0x%016llx (effective)\n", 6181 vcpu->arch.efer & ~(EFER_LMA | EFER_LME)); 6182 if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT) 6183 pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT)); 6184 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", 6185 vmcs_read64(GUEST_IA32_DEBUGCTL), 6186 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); 6187 if (cpu_has_load_perf_global_ctrl() && 6188 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 6189 pr_err("PerfGlobCtl = 0x%016llx\n", 6190 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); 6191 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) 6192 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); 6193 pr_err("Interruptibility = %08x ActivityState = %08x\n", 6194 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), 6195 vmcs_read32(GUEST_ACTIVITY_STATE)); 6196 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 6197 pr_err("InterruptStatus = %04x\n", 6198 vmcs_read16(GUEST_INTR_STATUS)); 6199 if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) 6200 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); 6201 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) 6202 vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); 6203 6204 pr_err("*** Host State ***\n"); 6205 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", 6206 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); 6207 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", 6208 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), 6209 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), 6210 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), 6211 vmcs_read16(HOST_TR_SELECTOR)); 6212 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", 6213 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), 6214 vmcs_readl(HOST_TR_BASE)); 6215 pr_err("GDTBase=%016lx IDTBase=%016lx\n", 6216 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); 6217 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", 6218 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), 6219 vmcs_readl(HOST_CR4)); 6220 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 6221 vmcs_readl(HOST_IA32_SYSENTER_ESP), 6222 vmcs_read32(HOST_IA32_SYSENTER_CS), 6223 
vmcs_readl(HOST_IA32_SYSENTER_EIP)); 6224 if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER) 6225 pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER)); 6226 if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT) 6227 pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT)); 6228 if (cpu_has_load_perf_global_ctrl() && 6229 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 6230 pr_err("PerfGlobCtl = 0x%016llx\n", 6231 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); 6232 if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) 6233 vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); 6234 6235 pr_err("*** Control State ***\n"); 6236 pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", 6237 cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control); 6238 pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n", 6239 pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl); 6240 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", 6241 vmcs_read32(EXCEPTION_BITMAP), 6242 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), 6243 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); 6244 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", 6245 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6246 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), 6247 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); 6248 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", 6249 vmcs_read32(VM_EXIT_INTR_INFO), 6250 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 6251 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 6252 pr_err(" reason=%08x qualification=%016lx\n", 6253 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); 6254 pr_err("IDTVectoring: info=%08x errcode=%08x\n", 6255 vmcs_read32(IDT_VECTORING_INFO_FIELD), 6256 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 6257 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); 6258 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) 6259 pr_err("TSC Multiplier = 0x%016llx\n", 6260 vmcs_read64(TSC_MULTIPLIER)); 6261 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { 6262 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 6263 u16 status = vmcs_read16(GUEST_INTR_STATUS); 6264 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); 6265 } 6266 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 6267 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) 6268 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); 6269 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); 6270 } 6271 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 6272 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 6273 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 6274 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); 6275 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) 6276 pr_err("PLE Gap=%08x Window=%08x\n", 6277 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); 6278 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) 6279 pr_err("Virtual processor ID = 0x%04x\n", 6280 vmcs_read16(VIRTUAL_PROCESSOR_ID)); 6281 } 6282 6283 /* 6284 * The guest has exited. See if we can fix it or if we need userspace 6285 * assistance. 6286 */ 6287 static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 6288 { 6289 struct vcpu_vmx *vmx = to_vmx(vcpu); 6290 union vmx_exit_reason exit_reason = vmx->exit_reason; 6291 u32 vectoring_info = vmx->idt_vectoring_info; 6292 u16 exit_handler_index; 6293 6294 /* 6295 * Flush logged GPAs PML buffer, this will make dirty_bitmap more 6296 * updated. 
Another benefit is that, in kvm_vm_ioctl_get_dirty_log, before
6297 * querying dirty_bitmap we only need to kick all vCPUs out of guest
6298 * mode: a vCPU that is in root mode is guaranteed to have flushed its
6299 * PML buffer already. Note, PML is never enabled in hardware while
6300 * running L2.
6301 */
6302 if (enable_pml && !is_guest_mode(vcpu))
6303 vmx_flush_pml_buffer(vcpu);
6304
6305 /*
6306 * KVM should never reach this point with a pending nested VM-Enter.
6307 * More specifically, short-circuiting VM-Entry to emulate L2 due to
6308 * invalid guest state should never happen as that means KVM knowingly
6309 * allowed a nested VM-Enter with an invalid vmcs12. More below.
6310 */
6311 if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
6312 return -EIO;
6313
6314 if (is_guest_mode(vcpu)) {
6315 /*
6316 * PML is never enabled when running L2, bail immediately if a
6317 * PML full exit occurs as something is horribly wrong.
6318 */
6319 if (exit_reason.basic == EXIT_REASON_PML_FULL)
6320 goto unexpected_vmexit;
6321
6322 /*
6323 * The host physical addresses of some pages of guest memory
6324 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6325 * Page). The CPU may write to these pages via their host
6326 * physical address while L2 is running, bypassing any
6327 * address-translation-based dirty tracking (e.g. EPT write
6328 * protection).
6329 *
6330 * Mark them dirty on every exit from L2 to prevent them from
6331 * getting out of sync with dirty tracking.
6332 */
6333 nested_mark_vmcs12_pages_dirty(vcpu);
6334
6335 /*
6336 * Synthesize a triple fault if L2 state is invalid. In normal
6337 * operation, nested VM-Enter rejects any attempt to enter L2
6338 * with invalid state. However, those checks are skipped if
6339 * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
6340 * L2 state is invalid, it means either L1 modified SMRAM state
6341 * or userspace provided bad state. Synthesize TRIPLE_FAULT as
6342 * doing so is architecturally allowed in the RSM case, and is
6343 * the least awful solution for the userspace case without
6344 * risking false positives.
6345 */
6346 if (vmx->emulation_required) {
6347 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
6348 return 1;
6349 }
6350
6351 if (nested_vmx_reflect_vmexit(vcpu))
6352 return 1;
6353 }
6354
6355 /* If guest state is invalid, start emulating. L2 is handled above. */
6356 if (vmx->emulation_required)
6357 return handle_invalid_guest_state(vcpu);
6358
6359 if (exit_reason.failed_vmentry) {
6360 dump_vmcs(vcpu);
6361 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6362 vcpu->run->fail_entry.hardware_entry_failure_reason
6363 = exit_reason.full;
6364 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6365 return 0;
6366 }
6367
6368 if (unlikely(vmx->fail)) {
6369 dump_vmcs(vcpu);
6370 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6371 vcpu->run->fail_entry.hardware_entry_failure_reason
6372 = vmcs_read32(VM_INSTRUCTION_ERROR);
6373 vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6374 return 0;
6375 }
6376
6377 /*
6378 * Note:
6379 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
6380 * event delivery, as that indicates the guest is accessing MMIO.
6381 * The VM-Exit would simply be triggered again after returning to the
6382 * guest, resulting in an infinite loop.
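 * Instead, report such exits to userspace as an internal error with
 * suberror KVM_INTERNAL_ERROR_DELIVERY_EV (see below).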
6383 */
6384 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6385 (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6386 exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6387 exit_reason.basic != EXIT_REASON_PML_FULL &&
6388 exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
6389 exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
6390 exit_reason.basic != EXIT_REASON_NOTIFY)) {
6391 int ndata = 3;
6392
6393 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6394 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6395 vcpu->run->internal.data[0] = vectoring_info;
6396 vcpu->run->internal.data[1] = exit_reason.full;
6397 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
6398 if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
6399 vcpu->run->internal.data[ndata++] =
6400 vmcs_read64(GUEST_PHYSICAL_ADDRESS);
6401 }
6402 vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
6403 vcpu->run->internal.ndata = ndata;
6404 return 0;
6405 }
6406
6407 if (unlikely(!enable_vnmi &&
6408 vmx->loaded_vmcs->soft_vnmi_blocked)) {
6409 if (!vmx_interrupt_blocked(vcpu)) {
6410 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6411 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
6412 vcpu->arch.nmi_pending) {
6413 /*
6414 * This CPU doesn't let us detect the end of an
6415 * NMI-blocked window if the guest runs with IRQs
6416 * disabled, so pull the trigger after 1 s of
6417 * futile waiting and inform the user about it.
6418 */
6419 printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6420 "state on VCPU %d after 1 s timeout\n",
6421 __func__, vcpu->vcpu_id);
6422 vmx->loaded_vmcs->soft_vnmi_blocked = 0;
6423 }
6424 }
6425
6426 if (exit_fastpath != EXIT_FASTPATH_NONE)
6427 return 1;
6428
6429 if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
6430 goto unexpected_vmexit;
6431 #ifdef CONFIG_RETPOLINE
6432 if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6433 return kvm_emulate_wrmsr(vcpu);
6434 else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
6435 return handle_preemption_timer(vcpu);
6436 else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
6437 return handle_interrupt_window(vcpu);
6438 else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6439 return handle_external_interrupt(vcpu);
6440 else if (exit_reason.basic == EXIT_REASON_HLT)
6441 return kvm_emulate_halt(vcpu);
6442 else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
6443 return handle_ept_misconfig(vcpu);
6444 #endif
6445
6446 exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6447 kvm_vmx_max_exit_handlers);
6448 if (!kvm_vmx_exit_handlers[exit_handler_index])
6449 goto unexpected_vmexit;
6450
6451 return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
6452
6453 unexpected_vmexit:
6454 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6455 exit_reason.full);
6456 dump_vmcs(vcpu);
6457 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6458 vcpu->run->internal.suberror =
6459 KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
6460 vcpu->run->internal.ndata = 2;
6461 vcpu->run->internal.data[0] = exit_reason.full;
6462 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
6463 return 0;
6464 }
6465
6466 static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6467 {
6468 int ret = __vmx_handle_exit(vcpu, exit_fastpath);
6469
6470 /*
6471 * Exit to user space when a bus lock is detected, so that userspace is
6472 * informed that the guest generated a bus lock.
6473 */ 6474 if (to_vmx(vcpu)->exit_reason.bus_lock_detected) { 6475 if (ret > 0) 6476 vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK; 6477 6478 vcpu->run->flags |= KVM_RUN_X86_BUS_LOCK; 6479 return 0; 6480 } 6481 return ret; 6482 } 6483 6484 /* 6485 * Software based L1D cache flush which is used when microcode providing 6486 * the cache control MSR is not loaded. 6487 * 6488 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to 6489 * flush it is required to read in 64 KiB because the replacement algorithm 6490 * is not exactly LRU. This could be sized at runtime via topology 6491 * information but as all relevant affected CPUs have 32KiB L1D cache size 6492 * there is no point in doing so. 6493 */ 6494 static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) 6495 { 6496 int size = PAGE_SIZE << L1D_CACHE_ORDER; 6497 6498 /* 6499 * This code is only executed when the flush mode is 'cond' or 6500 * 'always' 6501 */ 6502 if (static_branch_likely(&vmx_l1d_flush_cond)) { 6503 bool flush_l1d; 6504 6505 /* 6506 * Clear the per-vcpu flush bit, it gets set again 6507 * either from vcpu_run() or from one of the unsafe 6508 * VMEXIT handlers. 6509 */ 6510 flush_l1d = vcpu->arch.l1tf_flush_l1d; 6511 vcpu->arch.l1tf_flush_l1d = false; 6512 6513 /* 6514 * Clear the per-cpu flush bit, it gets set again from 6515 * the interrupt handlers. 6516 */ 6517 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); 6518 kvm_clear_cpu_l1tf_flush_l1d(); 6519 6520 if (!flush_l1d) 6521 return; 6522 } 6523 6524 vcpu->stat.l1d_flush++; 6525 6526 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { 6527 native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 6528 return; 6529 } 6530 6531 asm volatile( 6532 /* First ensure the pages are in the TLB */ 6533 "xorl %%eax, %%eax\n" 6534 ".Lpopulate_tlb:\n\t" 6535 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6536 "addl $4096, %%eax\n\t" 6537 "cmpl %%eax, %[size]\n\t" 6538 "jne .Lpopulate_tlb\n\t" 6539 "xorl %%eax, %%eax\n\t" 6540 "cpuid\n\t" 6541 /* Now fill the cache */ 6542 "xorl %%eax, %%eax\n" 6543 ".Lfill_cache:\n" 6544 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 6545 "addl $64, %%eax\n\t" 6546 "cmpl %%eax, %[size]\n\t" 6547 "jne .Lfill_cache\n\t" 6548 "lfence\n" 6549 :: [flush_pages] "r" (vmx_l1d_flush_pages), 6550 [size] "r" (size) 6551 : "eax", "ebx", "ecx", "edx"); 6552 } 6553 6554 static void vmx_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 6555 { 6556 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6557 int tpr_threshold; 6558 6559 if (is_guest_mode(vcpu) && 6560 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 6561 return; 6562 6563 tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; 6564 if (is_guest_mode(vcpu)) 6565 to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; 6566 else 6567 vmcs_write32(TPR_THRESHOLD, tpr_threshold); 6568 } 6569 6570 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) 6571 { 6572 struct vcpu_vmx *vmx = to_vmx(vcpu); 6573 u32 sec_exec_control; 6574 6575 if (!lapic_in_kernel(vcpu)) 6576 return; 6577 6578 if (!flexpriority_enabled && 6579 !cpu_has_vmx_virtualize_x2apic_mode()) 6580 return; 6581 6582 /* Postpone execution until vmcs01 is the current VMCS. 
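 * The deferred update is applied once vmcs01 is reloaded, when nested
 * VM-Exit processing consumes change_vmcs01_virtual_apic_mode.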
*/ 6583 if (is_guest_mode(vcpu)) { 6584 vmx->nested.change_vmcs01_virtual_apic_mode = true; 6585 return; 6586 } 6587 6588 sec_exec_control = secondary_exec_controls_get(vmx); 6589 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 6590 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 6591 6592 switch (kvm_get_apic_mode(vcpu)) { 6593 case LAPIC_MODE_INVALID: 6594 WARN_ONCE(true, "Invalid local APIC state"); 6595 break; 6596 case LAPIC_MODE_DISABLED: 6597 break; 6598 case LAPIC_MODE_XAPIC: 6599 if (flexpriority_enabled) { 6600 sec_exec_control |= 6601 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6602 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 6603 6604 /* 6605 * Flush the TLB, reloading the APIC access page will 6606 * only do so if its physical address has changed, but 6607 * the guest may have inserted a non-APIC mapping into 6608 * the TLB while the APIC access page was disabled. 6609 */ 6610 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 6611 } 6612 break; 6613 case LAPIC_MODE_X2APIC: 6614 if (cpu_has_vmx_virtualize_x2apic_mode()) 6615 sec_exec_control |= 6616 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 6617 break; 6618 } 6619 secondary_exec_controls_set(vmx, sec_exec_control); 6620 6621 vmx_update_msr_bitmap_x2apic(vcpu); 6622 } 6623 6624 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) 6625 { 6626 struct page *page; 6627 6628 /* Defer reload until vmcs01 is the current VMCS. */ 6629 if (is_guest_mode(vcpu)) { 6630 to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; 6631 return; 6632 } 6633 6634 if (!(secondary_exec_controls_get(to_vmx(vcpu)) & 6635 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 6636 return; 6637 6638 page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); 6639 if (is_error_page(page)) 6640 return; 6641 6642 vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page)); 6643 vmx_flush_tlb_current(vcpu); 6644 6645 /* 6646 * Do not pin apic access page in memory, the MMU notifier 6647 * will call us again if it is migrated or swapped out. 6648 */ 6649 put_page(page); 6650 } 6651 6652 static void vmx_hwapic_isr_update(int max_isr) 6653 { 6654 u16 status; 6655 u8 old; 6656 6657 if (max_isr == -1) 6658 max_isr = 0; 6659 6660 status = vmcs_read16(GUEST_INTR_STATUS); 6661 old = status >> 8; 6662 if (max_isr != old) { 6663 status &= 0xff; 6664 status |= max_isr << 8; 6665 vmcs_write16(GUEST_INTR_STATUS, status); 6666 } 6667 } 6668 6669 static void vmx_set_rvi(int vector) 6670 { 6671 u16 status; 6672 u8 old; 6673 6674 if (vector == -1) 6675 vector = 0; 6676 6677 status = vmcs_read16(GUEST_INTR_STATUS); 6678 old = (u8)status & 0xff; 6679 if ((u8)vector != old) { 6680 status &= ~0xff; 6681 status |= (u8)vector; 6682 vmcs_write16(GUEST_INTR_STATUS, status); 6683 } 6684 } 6685 6686 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 6687 { 6688 /* 6689 * When running L2, updating RVI is only relevant when 6690 * vmcs12 virtual-interrupt-delivery enabled. 6691 * However, it can be enabled only when L1 also 6692 * intercepts external-interrupts and in that case 6693 * we should not update vmcs02 RVI but instead intercept 6694 * interrupt. Therefore, do nothing when running L2. 
6695 */ 6696 if (!is_guest_mode(vcpu)) 6697 vmx_set_rvi(max_irr); 6698 } 6699 6700 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 6701 { 6702 struct vcpu_vmx *vmx = to_vmx(vcpu); 6703 int max_irr; 6704 bool got_posted_interrupt; 6705 6706 if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) 6707 return -EIO; 6708 6709 if (pi_test_on(&vmx->pi_desc)) { 6710 pi_clear_on(&vmx->pi_desc); 6711 /* 6712 * IOMMU can write to PID.ON, so the barrier matters even on UP. 6713 * But on x86 this is just a compiler barrier anyway. 6714 */ 6715 smp_mb__after_atomic(); 6716 got_posted_interrupt = 6717 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); 6718 } else { 6719 max_irr = kvm_lapic_find_highest_irr(vcpu); 6720 got_posted_interrupt = false; 6721 } 6722 6723 /* 6724 * Newly recognized interrupts are injected via either virtual interrupt 6725 * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is 6726 * disabled in two cases: 6727 * 6728 * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 6729 * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a 6730 * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected 6731 * into L2, but KVM doesn't use virtual interrupt delivery to inject 6732 * interrupts into L2, and so KVM_REQ_EVENT is again needed. 6733 * 6734 * 2) If APICv is disabled for this vCPU, assigned devices may still 6735 * attempt to post interrupts. The posted interrupt vector will cause 6736 * a VM-Exit and the subsequent entry will call sync_pir_to_irr. 6737 */ 6738 if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) 6739 vmx_set_rvi(max_irr); 6740 else if (got_posted_interrupt) 6741 kvm_make_request(KVM_REQ_EVENT, vcpu); 6742 6743 return max_irr; 6744 } 6745 6746 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 6747 { 6748 if (!kvm_vcpu_apicv_active(vcpu)) 6749 return; 6750 6751 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 6752 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 6753 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 6754 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 6755 } 6756 6757 static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) 6758 { 6759 struct vcpu_vmx *vmx = to_vmx(vcpu); 6760 6761 pi_clear_on(&vmx->pi_desc); 6762 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); 6763 } 6764 6765 void vmx_do_interrupt_nmi_irqoff(unsigned long entry); 6766 6767 static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu, 6768 unsigned long entry) 6769 { 6770 bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist; 6771 6772 kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ); 6773 vmx_do_interrupt_nmi_irqoff(entry); 6774 kvm_after_interrupt(vcpu); 6775 } 6776 6777 static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu) 6778 { 6779 /* 6780 * Save xfd_err to guest_fpu before interrupt is enabled, so the 6781 * MSR value is not clobbered by the host activity before the guest 6782 * has chance to consume it. 6783 * 6784 * Do not blindly read xfd_err here, since this exception might 6785 * be caused by L1 interception on a platform which doesn't 6786 * support xfd at all. 6787 * 6788 * Do it conditionally upon guest_fpu::xfd. xfd_err matters 6789 * only when xfd contains a non-zero value. 6790 * 6791 * Queuing exception is done in vmx_handle_exit. See comment there. 
6792 */ 6793 if (vcpu->arch.guest_fpu.fpstate->xfd) 6794 rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); 6795 } 6796 6797 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) 6798 { 6799 const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist; 6800 u32 intr_info = vmx_get_intr_info(&vmx->vcpu); 6801 6802 /* if exit due to PF check for async PF */ 6803 if (is_page_fault(intr_info)) 6804 vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); 6805 /* if exit due to NM, handle before interrupts are enabled */ 6806 else if (is_nm_fault(intr_info)) 6807 handle_nm_fault_irqoff(&vmx->vcpu); 6808 /* Handle machine checks before interrupts are enabled */ 6809 else if (is_machine_check(intr_info)) 6810 kvm_machine_check(); 6811 /* We need to handle NMIs before interrupts are enabled */ 6812 else if (is_nmi(intr_info)) 6813 handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry); 6814 } 6815 6816 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) 6817 { 6818 u32 intr_info = vmx_get_intr_info(vcpu); 6819 unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; 6820 gate_desc *desc = (gate_desc *)host_idt_base + vector; 6821 6822 if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, 6823 "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) 6824 return; 6825 6826 handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); 6827 vcpu->arch.at_instruction_boundary = true; 6828 } 6829 6830 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) 6831 { 6832 struct vcpu_vmx *vmx = to_vmx(vcpu); 6833 6834 if (vmx->emulation_required) 6835 return; 6836 6837 if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) 6838 handle_external_interrupt_irqoff(vcpu); 6839 else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) 6840 handle_exception_nmi_irqoff(vmx); 6841 } 6842 6843 /* 6844 * The kvm parameter can be NULL (module initialization, or invocation before 6845 * VM creation). Be sure to check the kvm parameter before using it. 6846 */ 6847 static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index) 6848 { 6849 switch (index) { 6850 case MSR_IA32_SMBASE: 6851 /* 6852 * We cannot do SMM unless we can run the guest in big 6853 * real mode. 6854 */ 6855 return enable_unrestricted_guest || emulate_invalid_guest_state; 6856 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 6857 return nested; 6858 case MSR_AMD64_VIRT_SPEC_CTRL: 6859 case MSR_AMD64_TSC_RATIO: 6860 /* This is AMD only. */ 6861 return false; 6862 default: 6863 return true; 6864 } 6865 } 6866 6867 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 6868 { 6869 u32 exit_intr_info; 6870 bool unblock_nmi; 6871 u8 vector; 6872 bool idtv_info_valid; 6873 6874 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 6875 6876 if (enable_vnmi) { 6877 if (vmx->loaded_vmcs->nmi_known_unmasked) 6878 return; 6879 6880 exit_intr_info = vmx_get_intr_info(&vmx->vcpu); 6881 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 6882 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 6883 /* 6884 * SDM 3: 27.7.1.2 (September 2008) 6885 * Re-set bit "block by NMI" before VM entry if vmexit caused by 6886 * a guest IRET fault. 6887 * SDM 3: 23.2.2 (September 2008) 6888 * Bit 12 is undefined in any of the following cases: 6889 * If the VM exit sets the valid bit in the IDT-vectoring 6890 * information field. 6891 * If the VM exit is due to a double fault. 
6892 */ 6893 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 6894 vector != DF_VECTOR && !idtv_info_valid) 6895 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6896 GUEST_INTR_STATE_NMI); 6897 else 6898 vmx->loaded_vmcs->nmi_known_unmasked = 6899 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 6900 & GUEST_INTR_STATE_NMI); 6901 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) 6902 vmx->loaded_vmcs->vnmi_blocked_time += 6903 ktime_to_ns(ktime_sub(ktime_get(), 6904 vmx->loaded_vmcs->entry_time)); 6905 } 6906 6907 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 6908 u32 idt_vectoring_info, 6909 int instr_len_field, 6910 int error_code_field) 6911 { 6912 u8 vector; 6913 int type; 6914 bool idtv_info_valid; 6915 6916 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 6917 6918 vcpu->arch.nmi_injected = false; 6919 kvm_clear_exception_queue(vcpu); 6920 kvm_clear_interrupt_queue(vcpu); 6921 6922 if (!idtv_info_valid) 6923 return; 6924 6925 kvm_make_request(KVM_REQ_EVENT, vcpu); 6926 6927 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 6928 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 6929 6930 switch (type) { 6931 case INTR_TYPE_NMI_INTR: 6932 vcpu->arch.nmi_injected = true; 6933 /* 6934 * SDM 3: 27.7.1.2 (September 2008) 6935 * Clear bit "block by NMI" before VM entry if a NMI 6936 * delivery faulted. 6937 */ 6938 vmx_set_nmi_mask(vcpu, false); 6939 break; 6940 case INTR_TYPE_SOFT_EXCEPTION: 6941 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 6942 fallthrough; 6943 case INTR_TYPE_HARD_EXCEPTION: 6944 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 6945 u32 err = vmcs_read32(error_code_field); 6946 kvm_requeue_exception_e(vcpu, vector, err); 6947 } else 6948 kvm_requeue_exception(vcpu, vector); 6949 break; 6950 case INTR_TYPE_SOFT_INTR: 6951 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 6952 fallthrough; 6953 case INTR_TYPE_EXT_INTR: 6954 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 6955 break; 6956 default: 6957 break; 6958 } 6959 } 6960 6961 static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 6962 { 6963 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 6964 VM_EXIT_INSTRUCTION_LEN, 6965 IDT_VECTORING_ERROR_CODE); 6966 } 6967 6968 static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 6969 { 6970 __vmx_complete_interrupts(vcpu, 6971 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6972 VM_ENTRY_INSTRUCTION_LEN, 6973 VM_ENTRY_EXCEPTION_ERROR_CODE); 6974 6975 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 6976 } 6977 6978 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 6979 { 6980 int i, nr_msrs; 6981 struct perf_guest_switch_msr *msrs; 6982 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); 6983 6984 pmu->host_cross_mapped_mask = 0; 6985 if (pmu->pebs_enable & pmu->global_ctrl) 6986 intel_pmu_cross_mapped_check(pmu); 6987 6988 /* Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. 
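 * Hence the NULL check below must be performed before nr_msrs is consumed.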
*/ 6989 msrs = perf_guest_get_msrs(&nr_msrs, (void *)pmu); 6990 if (!msrs) 6991 return; 6992 6993 for (i = 0; i < nr_msrs; i++) 6994 if (msrs[i].host == msrs[i].guest) 6995 clear_atomic_switch_msr(vmx, msrs[i].msr); 6996 else 6997 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 6998 msrs[i].host, false); 6999 } 7000 7001 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) 7002 { 7003 struct vcpu_vmx *vmx = to_vmx(vcpu); 7004 u64 tscl; 7005 u32 delta_tsc; 7006 7007 if (vmx->req_immediate_exit) { 7008 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); 7009 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7010 } else if (vmx->hv_deadline_tsc != -1) { 7011 tscl = rdtsc(); 7012 if (vmx->hv_deadline_tsc > tscl) 7013 /* set_hv_timer ensures the delta fits in 32-bits */ 7014 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 7015 cpu_preemption_timer_multi); 7016 else 7017 delta_tsc = 0; 7018 7019 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); 7020 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 7021 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { 7022 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); 7023 vmx->loaded_vmcs->hv_timer_soft_disabled = true; 7024 } 7025 } 7026 7027 void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) 7028 { 7029 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { 7030 vmx->loaded_vmcs->host_state.rsp = host_rsp; 7031 vmcs_writel(HOST_RSP, host_rsp); 7032 } 7033 } 7034 7035 void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, 7036 unsigned int flags) 7037 { 7038 u64 hostval = this_cpu_read(x86_spec_ctrl_current); 7039 7040 if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) 7041 return; 7042 7043 if (flags & VMX_RUN_SAVE_SPEC_CTRL) 7044 vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL); 7045 7046 /* 7047 * If the guest/host SPEC_CTRL values differ, restore the host value. 7048 * 7049 * For legacy IBRS, the IBRS bit always needs to be written after 7050 * transitioning from a less privileged predictor mode, regardless of 7051 * whether the guest/host values differ. 
7052 */ 7053 if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) || 7054 vmx->spec_ctrl != hostval) 7055 native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval); 7056 7057 barrier_nospec(); 7058 } 7059 7060 static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 7061 { 7062 switch (to_vmx(vcpu)->exit_reason.basic) { 7063 case EXIT_REASON_MSR_WRITE: 7064 return handle_fastpath_set_msr_irqoff(vcpu); 7065 case EXIT_REASON_PREEMPTION_TIMER: 7066 return handle_fastpath_preemption_timer(vcpu); 7067 default: 7068 return EXIT_FASTPATH_NONE; 7069 } 7070 } 7071 7072 static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, 7073 struct vcpu_vmx *vmx, 7074 unsigned long flags) 7075 { 7076 guest_state_enter_irqoff(); 7077 7078 /* L1D Flush includes CPU buffer clear to mitigate MDS */ 7079 if (static_branch_unlikely(&vmx_l1d_should_flush)) 7080 vmx_l1d_flush(vcpu); 7081 else if (static_branch_unlikely(&mds_user_clear)) 7082 mds_clear_cpu_buffers(); 7083 else if (static_branch_unlikely(&mmio_stale_data_clear) && 7084 kvm_arch_has_assigned_device(vcpu->kvm)) 7085 mds_clear_cpu_buffers(); 7086 7087 vmx_disable_fb_clear(vmx); 7088 7089 if (vcpu->arch.cr2 != native_read_cr2()) 7090 native_write_cr2(vcpu->arch.cr2); 7091 7092 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 7093 flags); 7094 7095 vcpu->arch.cr2 = native_read_cr2(); 7096 7097 vmx_enable_fb_clear(vmx); 7098 7099 guest_state_exit_irqoff(); 7100 } 7101 7102 static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) 7103 { 7104 struct vcpu_vmx *vmx = to_vmx(vcpu); 7105 unsigned long cr3, cr4; 7106 7107 /* Record the guest's net vcpu time for enforced NMI injections. */ 7108 if (unlikely(!enable_vnmi && 7109 vmx->loaded_vmcs->soft_vnmi_blocked)) 7110 vmx->loaded_vmcs->entry_time = ktime_get(); 7111 7112 /* 7113 * Don't enter VMX if guest state is invalid, let the exit handler 7114 * start emulation until we arrive back to a valid state. Synthesize a 7115 * consistency check VM-Exit due to invalid guest state and bail. 7116 */ 7117 if (unlikely(vmx->emulation_required)) { 7118 vmx->fail = 0; 7119 7120 vmx->exit_reason.full = EXIT_REASON_INVALID_STATE; 7121 vmx->exit_reason.failed_vmentry = 1; 7122 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); 7123 vmx->exit_qualification = ENTRY_FAIL_DEFAULT; 7124 kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); 7125 vmx->exit_intr_info = 0; 7126 return EXIT_FASTPATH_NONE; 7127 } 7128 7129 trace_kvm_entry(vcpu); 7130 7131 if (vmx->ple_window_dirty) { 7132 vmx->ple_window_dirty = false; 7133 vmcs_write32(PLE_WINDOW, vmx->ple_window); 7134 } 7135 7136 /* 7137 * We did this in prepare_switch_to_guest, because it needs to 7138 * be within srcu_read_lock. 7139 */ 7140 WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync); 7141 7142 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) 7143 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 7144 if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) 7145 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 7146 vcpu->arch.regs_dirty = 0; 7147 7148 /* 7149 * Refresh vmcs.HOST_CR3 if necessary. This must be done immediately 7150 * prior to VM-Enter, as the kernel may load a new ASID (PCID) any time 7151 * it switches back to the current->mm, which can occur in KVM context 7152 * when switching to a temporary mm to patch kernel code, e.g. if KVM 7153 * toggles a static key while handling a VM-Exit. 
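 * The last-written values are cached in loaded_vmcs->host_state so that
 * the VMWRITEs below are skipped when CR3/CR4 are unchanged.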
7154 */ 7155 cr3 = __get_current_cr3_fast(); 7156 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 7157 vmcs_writel(HOST_CR3, cr3); 7158 vmx->loaded_vmcs->host_state.cr3 = cr3; 7159 } 7160 7161 cr4 = cr4_read_shadow(); 7162 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 7163 vmcs_writel(HOST_CR4, cr4); 7164 vmx->loaded_vmcs->host_state.cr4 = cr4; 7165 } 7166 7167 /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ 7168 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) 7169 set_debugreg(vcpu->arch.dr6, 6); 7170 7171 /* When single-stepping over STI and MOV SS, we must clear the 7172 * corresponding interruptibility bits in the guest state. Otherwise 7173 * vmentry fails as it then expects bit 14 (BS) in pending debug 7174 * exceptions being set, but that's not correct for the guest debugging 7175 * case. */ 7176 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 7177 vmx_set_interrupt_shadow(vcpu, 0); 7178 7179 kvm_load_guest_xsave_state(vcpu); 7180 7181 pt_guest_enter(vmx); 7182 7183 atomic_switch_perf_msrs(vmx); 7184 if (intel_pmu_lbr_is_enabled(vcpu)) 7185 vmx_passthrough_lbr_msrs(vcpu); 7186 7187 if (enable_preemption_timer) 7188 vmx_update_hv_timer(vcpu); 7189 7190 kvm_wait_lapic_expire(vcpu); 7191 7192 /* The actual VMENTER/EXIT is in the .noinstr.text section. */ 7193 vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx)); 7194 7195 /* All fields are clean at this point */ 7196 if (static_branch_unlikely(&enable_evmcs)) { 7197 current_evmcs->hv_clean_fields |= 7198 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 7199 7200 current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu); 7201 } 7202 7203 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ 7204 if (vmx->host_debugctlmsr) 7205 update_debugctlmsr(vmx->host_debugctlmsr); 7206 7207 #ifndef CONFIG_X86_64 7208 /* 7209 * The sysexit path does not restore ds/es, so we must set them to 7210 * a reasonable value ourselves. 7211 * 7212 * We can't defer this to vmx_prepare_switch_to_host() since that 7213 * function may be executed in interrupt context, which saves and 7214 * restore segments around it, nullifying its effect. 7215 */ 7216 loadsegment(ds, __USER_DS); 7217 loadsegment(es, __USER_DS); 7218 #endif 7219 7220 vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET; 7221 7222 pt_guest_exit(vmx); 7223 7224 kvm_load_host_xsave_state(vcpu); 7225 7226 if (is_guest_mode(vcpu)) { 7227 /* 7228 * Track VMLAUNCH/VMRESUME that have made past guest state 7229 * checking. 
7230 */ 7231 if (vmx->nested.nested_run_pending && 7232 !vmx->exit_reason.failed_vmentry) 7233 ++vcpu->stat.nested_run; 7234 7235 vmx->nested.nested_run_pending = 0; 7236 } 7237 7238 vmx->idt_vectoring_info = 0; 7239 7240 if (unlikely(vmx->fail)) { 7241 vmx->exit_reason.full = 0xdead; 7242 return EXIT_FASTPATH_NONE; 7243 } 7244 7245 vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); 7246 if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY)) 7247 kvm_machine_check(); 7248 7249 if (likely(!vmx->exit_reason.failed_vmentry)) 7250 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 7251 7252 trace_kvm_exit(vcpu, KVM_ISA_VMX); 7253 7254 if (unlikely(vmx->exit_reason.failed_vmentry)) 7255 return EXIT_FASTPATH_NONE; 7256 7257 vmx->loaded_vmcs->launched = 1; 7258 7259 vmx_recover_nmi_blocking(vmx); 7260 vmx_complete_interrupts(vmx); 7261 7262 if (is_guest_mode(vcpu)) 7263 return EXIT_FASTPATH_NONE; 7264 7265 return vmx_exit_handlers_fastpath(vcpu); 7266 } 7267 7268 static void vmx_vcpu_free(struct kvm_vcpu *vcpu) 7269 { 7270 struct vcpu_vmx *vmx = to_vmx(vcpu); 7271 7272 if (enable_pml) 7273 vmx_destroy_pml_buffer(vmx); 7274 free_vpid(vmx->vpid); 7275 nested_vmx_free_vcpu(vcpu); 7276 free_loaded_vmcs(vmx->loaded_vmcs); 7277 } 7278 7279 static int vmx_vcpu_create(struct kvm_vcpu *vcpu) 7280 { 7281 struct vmx_uret_msr *tsx_ctrl; 7282 struct vcpu_vmx *vmx; 7283 int i, err; 7284 7285 BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0); 7286 vmx = to_vmx(vcpu); 7287 7288 INIT_LIST_HEAD(&vmx->pi_wakeup_list); 7289 7290 err = -ENOMEM; 7291 7292 vmx->vpid = allocate_vpid(); 7293 7294 /* 7295 * If PML is turned on, failure on enabling PML just results in failure 7296 * of creating the vcpu, therefore we can simplify PML logic (by 7297 * avoiding dealing with cases, such as enabling PML partially on vcpus 7298 * for the guest), etc. 7299 */ 7300 if (enable_pml) { 7301 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 7302 if (!vmx->pml_pg) 7303 goto free_vpid; 7304 } 7305 7306 for (i = 0; i < kvm_nr_uret_msrs; ++i) 7307 vmx->guest_uret_msrs[i].mask = -1ull; 7308 if (boot_cpu_has(X86_FEATURE_RTM)) { 7309 /* 7310 * TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception. 7311 * Keep the host value unchanged to avoid changing CPUID bits 7312 * under the host kernel's feet. 7313 */ 7314 tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 7315 if (tsx_ctrl) 7316 tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR; 7317 } 7318 7319 err = alloc_loaded_vmcs(&vmx->vmcs01); 7320 if (err < 0) 7321 goto free_pml; 7322 7323 /* 7324 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a 7325 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the 7326 * feature only for vmcs01, KVM currently isn't equipped to realize any 7327 * performance benefits from enabling it for vmcs02. 
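         *
         * Added background (an assumption summarizing the Hyper-V TLFS as
         * understood here, not text from this file): with this enlightenment
         * L0 only re-parses the MSR bitmap when KVM clears the corresponding
         * eVMCS clean-field bit, rather than on every VM-entry.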
7328 */ 7329 if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) && 7330 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { 7331 struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs; 7332 7333 evmcs->hv_enlightenments_control.msr_bitmap = 1; 7334 } 7335 7336 /* The MSR bitmap starts with all ones */ 7337 bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7338 bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS); 7339 7340 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R); 7341 #ifdef CONFIG_X86_64 7342 vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW); 7343 vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW); 7344 vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 7345 #endif 7346 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 7347 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 7348 vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 7349 if (kvm_cstate_in_guest(vcpu->kvm)) { 7350 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R); 7351 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); 7352 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); 7353 vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); 7354 } 7355 7356 vmx->loaded_vmcs = &vmx->vmcs01; 7357 7358 if (cpu_need_virtualize_apic_accesses(vcpu)) { 7359 err = alloc_apic_access_page(vcpu->kvm); 7360 if (err) 7361 goto free_vmcs; 7362 } 7363 7364 if (enable_ept && !enable_unrestricted_guest) { 7365 err = init_rmode_identity_map(vcpu->kvm); 7366 if (err) 7367 goto free_vmcs; 7368 } 7369 7370 if (vmx_can_use_ipiv(vcpu)) 7371 WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id], 7372 __pa(&vmx->pi_desc) | PID_TABLE_ENTRY_VALID); 7373 7374 return 0; 7375 7376 free_vmcs: 7377 free_loaded_vmcs(vmx->loaded_vmcs); 7378 free_pml: 7379 vmx_destroy_pml_buffer(vmx); 7380 free_vpid: 7381 free_vpid(vmx->vpid); 7382 return err; 7383 } 7384 7385 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7386 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 7387 7388 static int vmx_vm_init(struct kvm *kvm) 7389 { 7390 if (!ple_gap) 7391 kvm->arch.pause_in_guest = true; 7392 7393 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { 7394 switch (l1tf_mitigation) { 7395 case L1TF_MITIGATION_OFF: 7396 case L1TF_MITIGATION_FLUSH_NOWARN: 7397 /* 'I explicitly don't care' is set */ 7398 break; 7399 case L1TF_MITIGATION_FLUSH: 7400 case L1TF_MITIGATION_FLUSH_NOSMT: 7401 case L1TF_MITIGATION_FULL: 7402 /* 7403 * Warn upon starting the first VM in a potentially 7404 * insecure environment. 
7405 */ 7406 if (sched_smt_active()) 7407 pr_warn_once(L1TF_MSG_SMT); 7408 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 7409 pr_warn_once(L1TF_MSG_L1D); 7410 break; 7411 case L1TF_MITIGATION_FULL_FORCE: 7412 /* Flush is enforced */ 7413 break; 7414 } 7415 } 7416 return 0; 7417 } 7418 7419 static int __init vmx_check_processor_compat(void) 7420 { 7421 struct vmcs_config vmcs_conf; 7422 struct vmx_capability vmx_cap; 7423 7424 if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || 7425 !this_cpu_has(X86_FEATURE_VMX)) { 7426 pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id()); 7427 return -EIO; 7428 } 7429 7430 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) 7431 return -EIO; 7432 if (nested) 7433 nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept); 7434 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { 7435 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", 7436 smp_processor_id()); 7437 return -EIO; 7438 } 7439 return 0; 7440 } 7441 7442 static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 7443 { 7444 u8 cache; 7445 7446 /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in 7447 * memory aliases with conflicting memory types and sometimes MCEs. 7448 * We have to be careful as to what are honored and when. 7449 * 7450 * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to 7451 * UC. The effective memory type is UC or WC depending on guest PAT. 7452 * This was historically the source of MCEs and we want to be 7453 * conservative. 7454 * 7455 * When there is no need to deal with noncoherent DMA (e.g., no VT-d 7456 * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The 7457 * EPT memory type is set to WB. The effective memory type is forced 7458 * WB. 7459 * 7460 * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The 7461 * EPT memory type is used to emulate guest CD/MTRR. 7462 */ 7463 7464 if (is_mmio) 7465 return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 7466 7467 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) 7468 return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; 7469 7470 if (kvm_read_cr0(vcpu) & X86_CR0_CD) { 7471 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) 7472 cache = MTRR_TYPE_WRBACK; 7473 else 7474 cache = MTRR_TYPE_UNCACHABLE; 7475 7476 return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT; 7477 } 7478 7479 return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT; 7480 } 7481 7482 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) 7483 { 7484 /* 7485 * These bits in the secondary execution controls field 7486 * are dynamic, the others are mostly based on the hypervisor 7487 * architecture and the guest's CPUID. Do not touch the 7488 * dynamic bits. 7489 */ 7490 u32 mask = 7491 SECONDARY_EXEC_SHADOW_VMCS | 7492 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 7493 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 7494 SECONDARY_EXEC_DESC; 7495 7496 u32 cur_ctl = secondary_exec_controls_get(vmx); 7497 7498 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); 7499 } 7500 7501 /* 7502 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits 7503 * (indicating "allowed-1") if they are supported in the guest's CPUID. 
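 *
 * Illustrative example (an assumption, not taken from this file): if the
 * guest's CPUID does not advertise PCID, X86_CR4_PCIDE is left clear in
 * cr4_fixed1 below, so a nested CR4 value with PCIDE set fails a fixed-bits
 * check of roughly this shape:
 *
 *      if (cr4 & ~vmx->nested.msrs.cr4_fixed1) // a "fixed to 0" bit is set
 *              return false;                   // CR4 invalid for VMX operation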
7504 */ 7505 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) 7506 { 7507 struct vcpu_vmx *vmx = to_vmx(vcpu); 7508 struct kvm_cpuid_entry2 *entry; 7509 7510 vmx->nested.msrs.cr0_fixed1 = 0xffffffff; 7511 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; 7512 7513 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ 7514 if (entry && (entry->_reg & (_cpuid_mask))) \ 7515 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ 7516 } while (0) 7517 7518 entry = kvm_find_cpuid_entry(vcpu, 0x1); 7519 cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME)); 7520 cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME)); 7521 cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC)); 7522 cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE)); 7523 cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE)); 7524 cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE)); 7525 cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE)); 7526 cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE)); 7527 cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR)); 7528 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM)); 7529 cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX)); 7530 cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX)); 7531 cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID)); 7532 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE)); 7533 7534 entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 0); 7535 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE)); 7536 cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP)); 7537 cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP)); 7538 cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); 7539 cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); 7540 cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); 7541 7542 #undef cr4_fixed1_update 7543 } 7544 7545 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) 7546 { 7547 struct vcpu_vmx *vmx = to_vmx(vcpu); 7548 struct kvm_cpuid_entry2 *best = NULL; 7549 int i; 7550 7551 for (i = 0; i < PT_CPUID_LEAVES; i++) { 7552 best = kvm_find_cpuid_entry_index(vcpu, 0x14, i); 7553 if (!best) 7554 return; 7555 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; 7556 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; 7557 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; 7558 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; 7559 } 7560 7561 /* Get the number of configurable Address Ranges for filtering */ 7562 vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(vmx->pt_desc.caps, 7563 PT_CAP_num_address_ranges); 7564 7565 /* Initialize and clear the no dependency bits */ 7566 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | 7567 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC | 7568 RTIT_CTL_BRANCH_EN); 7569 7570 /* 7571 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise 7572 * will inject an #GP 7573 */ 7574 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) 7575 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; 7576 7577 /* 7578 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and 7579 * PSBFreq can be set 7580 */ 7581 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) 7582 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | 7583 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); 7584 7585 /* 7586 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set 7587 */ 7588 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) 7589 vmx->pt_desc.ctl_bitmask &= 
~(RTIT_CTL_MTC_EN | 7590 RTIT_CTL_MTC_RANGE); 7591 7592 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ 7593 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) 7594 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | 7595 RTIT_CTL_PTW_EN); 7596 7597 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ 7598 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) 7599 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; 7600 7601 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ 7602 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) 7603 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; 7604 7605 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */ 7606 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) 7607 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; 7608 7609 /* unmask address range configure area */ 7610 for (i = 0; i < vmx->pt_desc.num_address_ranges; i++) 7611 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); 7612 } 7613 7614 static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) 7615 { 7616 struct vcpu_vmx *vmx = to_vmx(vcpu); 7617 7618 /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ 7619 vcpu->arch.xsaves_enabled = false; 7620 7621 vmx_setup_uret_msrs(vmx); 7622 7623 if (cpu_has_secondary_exec_ctrls()) 7624 vmcs_set_secondary_exec_control(vmx, 7625 vmx_secondary_exec_control(vmx)); 7626 7627 if (nested_vmx_allowed(vcpu)) 7628 vmx->msr_ia32_feature_control_valid_bits |= 7629 FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7630 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 7631 else 7632 vmx->msr_ia32_feature_control_valid_bits &= 7633 ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX | 7634 FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX); 7635 7636 if (nested_vmx_allowed(vcpu)) 7637 nested_vmx_cr_fixed1_bits_update(vcpu); 7638 7639 if (boot_cpu_has(X86_FEATURE_INTEL_PT) && 7640 guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) 7641 update_intel_pt_cfg(vcpu); 7642 7643 if (boot_cpu_has(X86_FEATURE_RTM)) { 7644 struct vmx_uret_msr *msr; 7645 msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL); 7646 if (msr) { 7647 bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); 7648 vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); 7649 } 7650 } 7651 7652 if (kvm_cpu_cap_has(X86_FEATURE_XFD)) 7653 vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R, 7654 !guest_cpuid_has(vcpu, X86_FEATURE_XFD)); 7655 7656 7657 set_cr4_guest_host_mask(vmx); 7658 7659 vmx_write_encls_bitmap(vcpu, NULL); 7660 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX)) 7661 vmx->msr_ia32_feature_control_valid_bits |= FEAT_CTL_SGX_ENABLED; 7662 else 7663 vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED; 7664 7665 if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) 7666 vmx->msr_ia32_feature_control_valid_bits |= 7667 FEAT_CTL_SGX_LC_ENABLED; 7668 else 7669 vmx->msr_ia32_feature_control_valid_bits &= 7670 ~FEAT_CTL_SGX_LC_ENABLED; 7671 7672 /* Refresh #PF interception to account for MAXPHYADDR changes. 
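         *
         * Added note (hedged; see vmx_need_pf_intercept()): with
         * allow_smaller_maxphyaddr, a guest MAXPHYADDR below the host's
         * means #PF must now be intercepted so KVM can synthesize the
         * reserved-bit faults that hardware would not generate on its own.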
*/ 7673 vmx_update_exception_bitmap(vcpu); 7674 } 7675 7676 static __init void vmx_set_cpu_caps(void) 7677 { 7678 kvm_set_cpu_caps(); 7679 7680 /* CPUID 0x1 */ 7681 if (nested) 7682 kvm_cpu_cap_set(X86_FEATURE_VMX); 7683 7684 /* CPUID 0x7 */ 7685 if (kvm_mpx_supported()) 7686 kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); 7687 if (!cpu_has_vmx_invpcid()) 7688 kvm_cpu_cap_clear(X86_FEATURE_INVPCID); 7689 if (vmx_pt_mode_is_host_guest()) 7690 kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); 7691 if (vmx_pebs_supported()) { 7692 kvm_cpu_cap_check_and_set(X86_FEATURE_DS); 7693 kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64); 7694 } 7695 7696 if (!enable_pmu) 7697 kvm_cpu_cap_clear(X86_FEATURE_PDCM); 7698 7699 if (!enable_sgx) { 7700 kvm_cpu_cap_clear(X86_FEATURE_SGX); 7701 kvm_cpu_cap_clear(X86_FEATURE_SGX_LC); 7702 kvm_cpu_cap_clear(X86_FEATURE_SGX1); 7703 kvm_cpu_cap_clear(X86_FEATURE_SGX2); 7704 } 7705 7706 if (vmx_umip_emulated()) 7707 kvm_cpu_cap_set(X86_FEATURE_UMIP); 7708 7709 /* CPUID 0xD.1 */ 7710 kvm_caps.supported_xss = 0; 7711 if (!cpu_has_vmx_xsaves()) 7712 kvm_cpu_cap_clear(X86_FEATURE_XSAVES); 7713 7714 /* CPUID 0x80000001 and 0x7 (RDPID) */ 7715 if (!cpu_has_vmx_rdtscp()) { 7716 kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); 7717 kvm_cpu_cap_clear(X86_FEATURE_RDPID); 7718 } 7719 7720 if (cpu_has_vmx_waitpkg()) 7721 kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); 7722 } 7723 7724 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) 7725 { 7726 to_vmx(vcpu)->req_immediate_exit = true; 7727 } 7728 7729 static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, 7730 struct x86_instruction_info *info) 7731 { 7732 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7733 unsigned short port; 7734 bool intercept; 7735 int size; 7736 7737 if (info->intercept == x86_intercept_in || 7738 info->intercept == x86_intercept_ins) { 7739 port = info->src_val; 7740 size = info->dst_bytes; 7741 } else { 7742 port = info->dst_val; 7743 size = info->src_bytes; 7744 } 7745 7746 /* 7747 * If the 'use IO bitmaps' VM-execution control is 0, IO instruction 7748 * VM-exits depend on the 'unconditional IO exiting' VM-execution 7749 * control. 7750 * 7751 * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. 7752 */ 7753 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 7754 intercept = nested_cpu_has(vmcs12, 7755 CPU_BASED_UNCOND_IO_EXITING); 7756 else 7757 intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); 7758 7759 /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */ 7760 return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; 7761 } 7762 7763 static int vmx_check_intercept(struct kvm_vcpu *vcpu, 7764 struct x86_instruction_info *info, 7765 enum x86_intercept_stage stage, 7766 struct x86_exception *exception) 7767 { 7768 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7769 7770 switch (info->intercept) { 7771 /* 7772 * RDPID causes #UD if disabled through secondary execution controls. 7773 * Because it is marked as EmulateOnUD, we need to intercept it here. 7774 * Note, RDPID is hidden behind ENABLE_RDTSCP. 
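         *
         * Added note (an assumption based on the SDM, not text from this
         * file): RDPID has its own CPUID feature flag, but in VMX non-root
         * mode its #UD behaviour is gated by the same "enable RDTSCP"
         * secondary control, which is why the check below looks at
         * SECONDARY_EXEC_ENABLE_RDTSCP rather than an RDPID-specific bit.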
         */
        case x86_intercept_rdpid:
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
                        exception->vector = UD_VECTOR;
                        exception->error_code_valid = false;
                        return X86EMUL_PROPAGATE_FAULT;
                }
                break;

        case x86_intercept_in:
        case x86_intercept_ins:
        case x86_intercept_out:
        case x86_intercept_outs:
                return vmx_check_intercept_io(vcpu, info);

        case x86_intercept_lgdt:
        case x86_intercept_lidt:
        case x86_intercept_lldt:
        case x86_intercept_ltr:
        case x86_intercept_sgdt:
        case x86_intercept_sidt:
        case x86_intercept_sldt:
        case x86_intercept_str:
                if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
                        return X86EMUL_CONTINUE;

                /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
                break;

        /* TODO: check more intercepts... */
        default:
                break;
        }

        return X86EMUL_UNHANDLEABLE;
}

#ifdef CONFIG_X86_64
/* (a << shift) / divisor; returns 1 on overflow, otherwise 0. */
static inline int u64_shl_div_u64(u64 a, unsigned int shift,
                                  u64 divisor, u64 *result)
{
        u64 low = a << shift, high = a >> (64 - shift);

        /* Avoid overflowing divq: bail if the quotient would not fit in 64 bits. */
        if (high >= divisor)
                return 1;

        /* low holds the quotient, high holds the remainder, which is discarded. */
        asm("divq %2\n\t" : "=a" (low), "=d" (high) :
            "rm" (divisor), "0" (low), "1" (high));
        *result = low;

        return 0;
}

static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
                            bool *expired)
{
        struct vcpu_vmx *vmx;
        u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
        struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;

        vmx = to_vmx(vcpu);
        tscl = rdtsc();
        guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
        delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
        lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
                                                    ktimer->timer_advance_ns);

        if (delta_tsc > lapic_timer_advance_cycles)
                delta_tsc -= lapic_timer_advance_cycles;
        else
                delta_tsc = 0;

        /* Convert to host delta tsc if tsc scaling is enabled */
        if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
            delta_tsc && u64_shl_div_u64(delta_tsc,
                                kvm_caps.tsc_scaling_ratio_frac_bits,
                                vcpu->arch.l1_tsc_scaling_ratio, &delta_tsc))
                return -ERANGE;

        /*
         * If the delta tsc can't fit in 32 bits after shifting by
         * cpu_preemption_timer_multi, we can't use the preemption timer.
         * It's possible that it fits on later vmentries, but checking
         * on every vmentry is costly so we just use an hrtimer.
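         *
         * Worked example with assumed numbers (not from the original
         * comment): the preemption timer is a 32-bit down-counter that
         * ticks at TSC >> cpu_preemption_timer_multi, so the programmed
         * value is delta_tsc >> cpu_preemption_timer_multi and must fit
         * in 32 bits, i.e. delta_tsc >> (cpu_preemption_timer_multi + 32)
         * must be zero.  With a 3 GHz TSC and a rate shift of 5 that
         * still covers roughly (2^32 << 5) / 3e9 ~= 45 seconds of guest
         * time before this check fails.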
         */
        if (delta_tsc >> (cpu_preemption_timer_multi + 32))
                return -ERANGE;

        vmx->hv_deadline_tsc = tscl + delta_tsc;
        *expired = !delta_tsc;
        return 0;
}

static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
{
        to_vmx(vcpu)->hv_deadline_tsc = -1;
}
#endif

static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
        if (!kvm_pause_in_guest(vcpu->kvm))
                shrink_ple_window(vcpu);
}

void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (is_guest_mode(vcpu)) {
                vmx->nested.update_vmcs01_cpu_dirty_logging = true;
                return;
        }

        /*
         * Note, cpu_dirty_logging_count can be changed concurrent with this
         * code, but in that case another update request will be made and so
         * the guest will never run with a stale PML value.
         */
        if (vcpu->kvm->arch.cpu_dirty_logging_count)
                secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
        else
                secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}

static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
        if (vcpu->arch.mcg_cap & MCG_LMCE_P)
                to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
                        FEAT_CTL_LMCE_ENABLED;
        else
                to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
                        ~FEAT_CTL_LMCE_ENABLED;
}

static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
        /* We need a nested VM-exit to enter SMM; postpone if a nested run is pending. */
        if (to_vmx(vcpu)->nested.nested_run_pending)
                return -EBUSY;
        return !is_smm(vcpu);
}

static int vmx_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        /*
         * TODO: Implement custom flows for forcing the vCPU out/in of L2 on
         * SMI and RSM.  Using the common VM-Exit + VM-Enter routines is
         * wrong, because SMI and RSM only modify state that is saved and
         * restored via SMRAM.  E.g. most MSRs are left untouched, but many
         * are modified by VM-Exit and VM-Enter, and thus L2's values may be
         * corrupted on SMI+RSM.
         */
        vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
        if (vmx->nested.smm.guest_mode)
                nested_vmx_vmexit(vcpu, -1, 0, 0);

        vmx->nested.smm.vmxon = vmx->nested.vmxon;
        vmx->nested.vmxon = false;
        vmx_clear_hlt(vcpu);
        return 0;
}

static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int ret;

        if (vmx->nested.smm.vmxon) {
                vmx->nested.vmxon = true;
                vmx->nested.smm.vmxon = false;
        }

        if (vmx->nested.smm.guest_mode) {
                ret = nested_vmx_enter_non_root_mode(vcpu, false);
                if (ret)
                        return ret;

                vmx->nested.nested_run_pending = 1;
                vmx->nested.smm.guest_mode = false;
        }
        return 0;
}

static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
        /* RSM will cause a vmexit anyway.
*/ 7966 } 7967 7968 static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 7969 { 7970 return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu); 7971 } 7972 7973 static void vmx_migrate_timers(struct kvm_vcpu *vcpu) 7974 { 7975 if (is_guest_mode(vcpu)) { 7976 struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; 7977 7978 if (hrtimer_try_to_cancel(timer) == 1) 7979 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); 7980 } 7981 } 7982 7983 static void vmx_hardware_unsetup(void) 7984 { 7985 kvm_set_posted_intr_wakeup_handler(NULL); 7986 7987 if (nested) 7988 nested_vmx_hardware_unsetup(); 7989 7990 free_kvm_area(); 7991 } 7992 7993 static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason) 7994 { 7995 ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | 7996 BIT(APICV_INHIBIT_REASON_ABSENT) | 7997 BIT(APICV_INHIBIT_REASON_HYPERV) | 7998 BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | 7999 BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | 8000 BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED); 8001 8002 return supported & BIT(reason); 8003 } 8004 8005 static void vmx_vm_destroy(struct kvm *kvm) 8006 { 8007 struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm); 8008 8009 free_pages((unsigned long)kvm_vmx->pid_table, vmx_get_pid_table_order(kvm)); 8010 } 8011 8012 static struct kvm_x86_ops vmx_x86_ops __initdata = { 8013 .name = "kvm_intel", 8014 8015 .hardware_unsetup = vmx_hardware_unsetup, 8016 8017 .hardware_enable = vmx_hardware_enable, 8018 .hardware_disable = vmx_hardware_disable, 8019 .has_emulated_msr = vmx_has_emulated_msr, 8020 8021 .vm_size = sizeof(struct kvm_vmx), 8022 .vm_init = vmx_vm_init, 8023 .vm_destroy = vmx_vm_destroy, 8024 8025 .vcpu_precreate = vmx_vcpu_precreate, 8026 .vcpu_create = vmx_vcpu_create, 8027 .vcpu_free = vmx_vcpu_free, 8028 .vcpu_reset = vmx_vcpu_reset, 8029 8030 .prepare_switch_to_guest = vmx_prepare_switch_to_guest, 8031 .vcpu_load = vmx_vcpu_load, 8032 .vcpu_put = vmx_vcpu_put, 8033 8034 .update_exception_bitmap = vmx_update_exception_bitmap, 8035 .get_msr_feature = vmx_get_msr_feature, 8036 .get_msr = vmx_get_msr, 8037 .set_msr = vmx_set_msr, 8038 .get_segment_base = vmx_get_segment_base, 8039 .get_segment = vmx_get_segment, 8040 .set_segment = vmx_set_segment, 8041 .get_cpl = vmx_get_cpl, 8042 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 8043 .set_cr0 = vmx_set_cr0, 8044 .is_valid_cr4 = vmx_is_valid_cr4, 8045 .set_cr4 = vmx_set_cr4, 8046 .set_efer = vmx_set_efer, 8047 .get_idt = vmx_get_idt, 8048 .set_idt = vmx_set_idt, 8049 .get_gdt = vmx_get_gdt, 8050 .set_gdt = vmx_set_gdt, 8051 .set_dr7 = vmx_set_dr7, 8052 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, 8053 .cache_reg = vmx_cache_reg, 8054 .get_rflags = vmx_get_rflags, 8055 .set_rflags = vmx_set_rflags, 8056 .get_if_flag = vmx_get_if_flag, 8057 8058 .flush_tlb_all = vmx_flush_tlb_all, 8059 .flush_tlb_current = vmx_flush_tlb_current, 8060 .flush_tlb_gva = vmx_flush_tlb_gva, 8061 .flush_tlb_guest = vmx_flush_tlb_guest, 8062 8063 .vcpu_pre_run = vmx_vcpu_pre_run, 8064 .vcpu_run = vmx_vcpu_run, 8065 .handle_exit = vmx_handle_exit, 8066 .skip_emulated_instruction = vmx_skip_emulated_instruction, 8067 .update_emulated_instruction = vmx_update_emulated_instruction, 8068 .set_interrupt_shadow = vmx_set_interrupt_shadow, 8069 .get_interrupt_shadow = vmx_get_interrupt_shadow, 8070 .patch_hypercall = vmx_patch_hypercall, 8071 .inject_irq = vmx_inject_irq, 8072 .inject_nmi = vmx_inject_nmi, 8073 .queue_exception = vmx_queue_exception, 8074 .cancel_injection = vmx_cancel_injection, 8075 
.interrupt_allowed = vmx_interrupt_allowed, 8076 .nmi_allowed = vmx_nmi_allowed, 8077 .get_nmi_mask = vmx_get_nmi_mask, 8078 .set_nmi_mask = vmx_set_nmi_mask, 8079 .enable_nmi_window = vmx_enable_nmi_window, 8080 .enable_irq_window = vmx_enable_irq_window, 8081 .update_cr8_intercept = vmx_update_cr8_intercept, 8082 .set_virtual_apic_mode = vmx_set_virtual_apic_mode, 8083 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 8084 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, 8085 .load_eoi_exitmap = vmx_load_eoi_exitmap, 8086 .apicv_post_state_restore = vmx_apicv_post_state_restore, 8087 .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons, 8088 .hwapic_irr_update = vmx_hwapic_irr_update, 8089 .hwapic_isr_update = vmx_hwapic_isr_update, 8090 .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, 8091 .sync_pir_to_irr = vmx_sync_pir_to_irr, 8092 .deliver_interrupt = vmx_deliver_interrupt, 8093 .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, 8094 8095 .set_tss_addr = vmx_set_tss_addr, 8096 .set_identity_map_addr = vmx_set_identity_map_addr, 8097 .get_mt_mask = vmx_get_mt_mask, 8098 8099 .get_exit_info = vmx_get_exit_info, 8100 8101 .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, 8102 8103 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 8104 8105 .get_l2_tsc_offset = vmx_get_l2_tsc_offset, 8106 .get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier, 8107 .write_tsc_offset = vmx_write_tsc_offset, 8108 .write_tsc_multiplier = vmx_write_tsc_multiplier, 8109 8110 .load_mmu_pgd = vmx_load_mmu_pgd, 8111 8112 .check_intercept = vmx_check_intercept, 8113 .handle_exit_irqoff = vmx_handle_exit_irqoff, 8114 8115 .request_immediate_exit = vmx_request_immediate_exit, 8116 8117 .sched_in = vmx_sched_in, 8118 8119 .cpu_dirty_log_size = PML_ENTITY_NUM, 8120 .update_cpu_dirty_logging = vmx_update_cpu_dirty_logging, 8121 8122 .nested_ops = &vmx_nested_ops, 8123 8124 .pi_update_irte = vmx_pi_update_irte, 8125 .pi_start_assignment = vmx_pi_start_assignment, 8126 8127 #ifdef CONFIG_X86_64 8128 .set_hv_timer = vmx_set_hv_timer, 8129 .cancel_hv_timer = vmx_cancel_hv_timer, 8130 #endif 8131 8132 .setup_mce = vmx_setup_mce, 8133 8134 .smi_allowed = vmx_smi_allowed, 8135 .enter_smm = vmx_enter_smm, 8136 .leave_smm = vmx_leave_smm, 8137 .enable_smi_window = vmx_enable_smi_window, 8138 8139 .can_emulate_instruction = vmx_can_emulate_instruction, 8140 .apic_init_signal_blocked = vmx_apic_init_signal_blocked, 8141 .migrate_timers = vmx_migrate_timers, 8142 8143 .msr_filter_changed = vmx_msr_filter_changed, 8144 .complete_emulated_msr = kvm_complete_insn_gp, 8145 8146 .vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector, 8147 }; 8148 8149 static unsigned int vmx_handle_intel_pt_intr(void) 8150 { 8151 struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 8152 8153 /* '0' on failure so that the !PT case can use a RET0 static call. */ 8154 if (!vcpu || !kvm_handling_nmi_from_guest(vcpu)) 8155 return 0; 8156 8157 kvm_make_request(KVM_REQ_PMI, vcpu); 8158 __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, 8159 (unsigned long *)&vcpu->arch.pmu.global_status); 8160 return 1; 8161 } 8162 8163 static __init void vmx_setup_user_return_msrs(void) 8164 { 8165 8166 /* 8167 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 8168 * will emulate SYSCALL in legacy mode if the vendor string in guest 8169 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To 8170 * support this emulation, MSR_STAR is included in the list for i386, 8171 * but is never loaded into hardware. 
MSR_CSTAR is also never loaded 8172 * into hardware and is here purely for emulation purposes. 8173 */ 8174 const u32 vmx_uret_msrs_list[] = { 8175 #ifdef CONFIG_X86_64 8176 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 8177 #endif 8178 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 8179 MSR_IA32_TSX_CTRL, 8180 }; 8181 int i; 8182 8183 BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS); 8184 8185 for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) 8186 kvm_add_user_return_msr(vmx_uret_msrs_list[i]); 8187 } 8188 8189 static void __init vmx_setup_me_spte_mask(void) 8190 { 8191 u64 me_mask = 0; 8192 8193 /* 8194 * kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use 8195 * the former to avoid exposing shadow_phys_bits. 8196 * 8197 * On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to 8198 * shadow_phys_bits. On MKTME and/or TDX capable systems, 8199 * boot_cpu_data.x86_phys_bits holds the actual physical address 8200 * w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR 8201 * reported by CPUID. Those bits between are KeyID bits. 8202 */ 8203 if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits()) 8204 me_mask = rsvd_bits(boot_cpu_data.x86_phys_bits, 8205 kvm_get_shadow_phys_bits() - 1); 8206 /* 8207 * Unlike SME, host kernel doesn't support setting up any 8208 * MKTME KeyID on Intel platforms. No memory encryption 8209 * bits should be included into the SPTE. 8210 */ 8211 kvm_mmu_set_me_spte_mask(0, me_mask); 8212 } 8213 8214 static struct kvm_x86_init_ops vmx_init_ops __initdata; 8215 8216 static __init int hardware_setup(void) 8217 { 8218 unsigned long host_bndcfgs; 8219 struct desc_ptr dt; 8220 int r; 8221 8222 store_idt(&dt); 8223 host_idt_base = dt.address; 8224 8225 vmx_setup_user_return_msrs(); 8226 8227 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 8228 return -EIO; 8229 8230 if (boot_cpu_has(X86_FEATURE_NX)) 8231 kvm_enable_efer_bits(EFER_NX); 8232 8233 if (boot_cpu_has(X86_FEATURE_MPX)) { 8234 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); 8235 WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); 8236 } 8237 8238 if (!cpu_has_vmx_mpx()) 8239 kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | 8240 XFEATURE_MASK_BNDCSR); 8241 8242 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || 8243 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) 8244 enable_vpid = 0; 8245 8246 if (!cpu_has_vmx_ept() || 8247 !cpu_has_vmx_ept_4levels() || 8248 !cpu_has_vmx_ept_mt_wb() || 8249 !cpu_has_vmx_invept_global()) 8250 enable_ept = 0; 8251 8252 /* NX support is required for shadow paging. */ 8253 if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) { 8254 pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n"); 8255 return -EOPNOTSUPP; 8256 } 8257 8258 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 8259 enable_ept_ad_bits = 0; 8260 8261 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) 8262 enable_unrestricted_guest = 0; 8263 8264 if (!cpu_has_vmx_flexpriority()) 8265 flexpriority_enabled = 0; 8266 8267 if (!cpu_has_virtual_nmis()) 8268 enable_vnmi = 0; 8269 8270 /* 8271 * set_apic_access_page_addr() is used to reload apic access 8272 * page upon invalidation. No need to do anything if not 8273 * using the APIC_ACCESS_ADDR VMCS field. 
8274 */ 8275 if (!flexpriority_enabled) 8276 vmx_x86_ops.set_apic_access_page_addr = NULL; 8277 8278 if (!cpu_has_vmx_tpr_shadow()) 8279 vmx_x86_ops.update_cr8_intercept = NULL; 8280 8281 #if IS_ENABLED(CONFIG_HYPERV) 8282 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH 8283 && enable_ept) { 8284 vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; 8285 vmx_x86_ops.tlb_remote_flush_with_range = 8286 hv_remote_flush_tlb_with_range; 8287 } 8288 #endif 8289 8290 if (!cpu_has_vmx_ple()) { 8291 ple_gap = 0; 8292 ple_window = 0; 8293 ple_window_grow = 0; 8294 ple_window_max = 0; 8295 ple_window_shrink = 0; 8296 } 8297 8298 if (!cpu_has_vmx_apicv()) 8299 enable_apicv = 0; 8300 if (!enable_apicv) 8301 vmx_x86_ops.sync_pir_to_irr = NULL; 8302 8303 if (!enable_apicv || !cpu_has_vmx_ipiv()) 8304 enable_ipiv = false; 8305 8306 if (cpu_has_vmx_tsc_scaling()) 8307 kvm_caps.has_tsc_control = true; 8308 8309 kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; 8310 kvm_caps.tsc_scaling_ratio_frac_bits = 48; 8311 kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection(); 8312 kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit(); 8313 8314 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 8315 8316 if (enable_ept) 8317 kvm_mmu_set_ept_masks(enable_ept_ad_bits, 8318 cpu_has_vmx_ept_execute_only()); 8319 8320 /* 8321 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID 8322 * bits to shadow_zero_check. 8323 */ 8324 vmx_setup_me_spte_mask(); 8325 8326 kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), 8327 ept_caps_to_lpage_level(vmx_capability.ept)); 8328 8329 /* 8330 * Only enable PML when hardware supports PML feature, and both EPT 8331 * and EPT A/D bit features are enabled -- PML depends on them to work. 8332 */ 8333 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 8334 enable_pml = 0; 8335 8336 if (!enable_pml) 8337 vmx_x86_ops.cpu_dirty_log_size = 0; 8338 8339 if (!cpu_has_vmx_preemption_timer()) 8340 enable_preemption_timer = false; 8341 8342 if (enable_preemption_timer) { 8343 u64 use_timer_freq = 5000ULL * 1000 * 1000; 8344 u64 vmx_msr; 8345 8346 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); 8347 cpu_preemption_timer_multi = 8348 vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; 8349 8350 if (tsc_khz) 8351 use_timer_freq = (u64)tsc_khz * 1000; 8352 use_timer_freq >>= cpu_preemption_timer_multi; 8353 8354 /* 8355 * KVM "disables" the preemption timer by setting it to its max 8356 * value. Don't use the timer if it might cause spurious exits 8357 * at a rate faster than 0.1 Hz (of uninterrupted guest time). 
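                 *
                 * Worked example with assumed numbers (not from the original
                 * comment): the timer value is 32 bits wide and counts down
                 * at use_timer_freq ticks per second, so the longest
                 * programmable period is 0xffffffff / use_timer_freq seconds.
                 * Requiring at least 10 seconds (0.1 Hz) gives the bound
                 * below, use_timer_freq <= 0xffffffff / 10 ~= 429 MHz; e.g. a
                 * 3 GHz TSC with a rate shift of 0 disables the timer, while
                 * a shift of 5 (~94 MHz) keeps it usable.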
8358 */ 8359 if (use_timer_freq > 0xffffffffu / 10) 8360 enable_preemption_timer = false; 8361 } 8362 8363 if (!enable_preemption_timer) { 8364 vmx_x86_ops.set_hv_timer = NULL; 8365 vmx_x86_ops.cancel_hv_timer = NULL; 8366 vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; 8367 } 8368 8369 kvm_caps.supported_mce_cap |= MCG_LMCE_P; 8370 kvm_caps.supported_mce_cap |= MCG_CMCI_P; 8371 8372 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) 8373 return -EINVAL; 8374 if (!enable_ept || !enable_pmu || !cpu_has_vmx_intel_pt()) 8375 pt_mode = PT_MODE_SYSTEM; 8376 if (pt_mode == PT_MODE_HOST_GUEST) 8377 vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr; 8378 else 8379 vmx_init_ops.handle_intel_pt_intr = NULL; 8380 8381 setup_default_sgx_lepubkeyhash(); 8382 8383 if (nested) { 8384 nested_vmx_setup_ctls_msrs(&vmcs_config.nested, 8385 vmx_capability.ept); 8386 8387 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); 8388 if (r) 8389 return r; 8390 } 8391 8392 vmx_set_cpu_caps(); 8393 8394 r = alloc_kvm_area(); 8395 if (r && nested) 8396 nested_vmx_hardware_unsetup(); 8397 8398 kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler); 8399 8400 return r; 8401 } 8402 8403 static struct kvm_x86_init_ops vmx_init_ops __initdata = { 8404 .cpu_has_kvm_support = cpu_has_kvm_support, 8405 .disabled_by_bios = vmx_disabled_by_bios, 8406 .check_processor_compatibility = vmx_check_processor_compat, 8407 .hardware_setup = hardware_setup, 8408 .handle_intel_pt_intr = NULL, 8409 8410 .runtime_ops = &vmx_x86_ops, 8411 .pmu_ops = &intel_pmu_ops, 8412 }; 8413 8414 static void vmx_cleanup_l1d_flush(void) 8415 { 8416 if (vmx_l1d_flush_pages) { 8417 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 8418 vmx_l1d_flush_pages = NULL; 8419 } 8420 /* Restore state so sysfs ignores VMX */ 8421 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 8422 } 8423 8424 static void vmx_exit(void) 8425 { 8426 #ifdef CONFIG_KEXEC_CORE 8427 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); 8428 synchronize_rcu(); 8429 #endif 8430 8431 kvm_exit(); 8432 8433 #if IS_ENABLED(CONFIG_HYPERV) 8434 if (static_branch_unlikely(&enable_evmcs)) { 8435 int cpu; 8436 struct hv_vp_assist_page *vp_ap; 8437 /* 8438 * Reset everything to support using non-enlightened VMCS 8439 * access later (e.g. when we reload the module with 8440 * enlightened_vmcs=0) 8441 */ 8442 for_each_online_cpu(cpu) { 8443 vp_ap = hv_get_vp_assist_page(cpu); 8444 8445 if (!vp_ap) 8446 continue; 8447 8448 vp_ap->nested_control.features.directhypercall = 0; 8449 vp_ap->current_nested_vmcs = 0; 8450 vp_ap->enlighten_vmentry = 0; 8451 } 8452 8453 static_branch_disable(&enable_evmcs); 8454 } 8455 #endif 8456 vmx_cleanup_l1d_flush(); 8457 8458 allow_smaller_maxphyaddr = false; 8459 } 8460 module_exit(vmx_exit); 8461 8462 static int __init vmx_init(void) 8463 { 8464 int r, cpu; 8465 8466 #if IS_ENABLED(CONFIG_HYPERV) 8467 /* 8468 * Enlightened VMCS usage should be recommended and the host needs 8469 * to support eVMCS v1 or above. We can also disable eVMCS support 8470 * with module parameter. 
8471 */ 8472 if (enlightened_vmcs && 8473 ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && 8474 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= 8475 KVM_EVMCS_VERSION) { 8476 8477 /* Check that we have assist pages on all online CPUs */ 8478 for_each_online_cpu(cpu) { 8479 if (!hv_get_vp_assist_page(cpu)) { 8480 enlightened_vmcs = false; 8481 break; 8482 } 8483 } 8484 8485 if (enlightened_vmcs) { 8486 pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); 8487 static_branch_enable(&enable_evmcs); 8488 } 8489 8490 if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) 8491 vmx_x86_ops.enable_direct_tlbflush 8492 = hv_enable_direct_tlbflush; 8493 8494 } else { 8495 enlightened_vmcs = false; 8496 } 8497 #endif 8498 8499 r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), 8500 __alignof__(struct vcpu_vmx), THIS_MODULE); 8501 if (r) 8502 return r; 8503 8504 /* 8505 * Must be called after kvm_init() so enable_ept is properly set 8506 * up. Hand the parameter mitigation value in which was stored in 8507 * the pre module init parser. If no parameter was given, it will 8508 * contain 'auto' which will be turned into the default 'cond' 8509 * mitigation mode. 8510 */ 8511 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 8512 if (r) { 8513 vmx_exit(); 8514 return r; 8515 } 8516 8517 vmx_setup_fb_clear_ctrl(); 8518 8519 for_each_possible_cpu(cpu) { 8520 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 8521 8522 pi_init_cpu(cpu); 8523 } 8524 8525 #ifdef CONFIG_KEXEC_CORE 8526 rcu_assign_pointer(crash_vmclear_loaded_vmcss, 8527 crash_vmclear_local_loaded_vmcss); 8528 #endif 8529 vmx_check_vmcs12_offsets(); 8530 8531 /* 8532 * Shadow paging doesn't have a (further) performance penalty 8533 * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it 8534 * by default 8535 */ 8536 if (!enable_ept) 8537 allow_smaller_maxphyaddr = true; 8538 8539 return 0; 8540 } 8541 module_init(vmx_init); 8542