// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <linux/frame.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>

#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
#include <asm/perf_event.h>
#include <asm/mce.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/spec-ctrl.h>
#include <asm/virtext.h>
#include <asm/vmx.h>

#include "capabilities.h"
#include "cpuid.h"
#include "evmcs.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
#include "ops.h"
#include "pmu.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_FEATURE_MATCH(X86_FEATURE_VMX),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);

bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);

bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

static bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);
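
/*
 * Illustrative usage of the knobs above (example values, not a
 * recommendation): most of them are read-only after load, so they are set
 * on the modprobe command line, e.g.
 *
 *   modprobe kvm_intel ept=0 unrestricted_guest=0 enable_apicv=0
 *
 * and can then be inspected under /sys/module/kvm_intel/parameters/.
 */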
/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for their own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);

static u64 __read_mostly host_xss;

bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);

static bool __read_mostly dump_invalid_vmcs = 0;
module_param(dump_invalid_vmcs, bool, 0644);

#define MSR_BITMAP_MODE_X2APIC		1
#define MSR_BITMAP_MODE_X2APIC_APICV	2

#define KVM_VMX_TSC_MULTIPLIER_MAX	0xffffffffffffffffULL

/* Guest_tsc -> host_tsc conversion requires 64-bit division.  */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif

#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |	\
	 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS				      \
	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
	 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)

#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
	RTIT_STATUS_BYTECNT))

#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
	(~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)

/*
 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is
 *             enabled.
 *             According to test, this time is usually smaller than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer to SDM volume 3b section 21.6.13 & 22.1.3.
 */
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);

static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);

/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);
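
/*
 * Worked example of the parameters above, assuming the generic KVM defaults
 * (ple_gap = 128 and ple_window = 4096 TSC-rate cycles): two PAUSEs executed
 * within 128 cycles of each other are treated as part of the same spin loop,
 * and once such a loop has spun for more than 4096 cycles a PAUSE-loop exit
 * is taken so the host can run another vCPU instead of letting this one
 * busy-wait.
 */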
/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);

/* Default is SYSTEM mode, 1 for host-guest mode */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
module_param(pt_mode, int, S_IRUGO);

static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);

/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;

static const struct {
	const char *option;
	bool for_parse;
} vmentry_l1d_param[] = {
	[VMENTER_L1D_FLUSH_AUTO]	 = {"auto", true},
	[VMENTER_L1D_FLUSH_NEVER]	 = {"never", true},
	[VMENTER_L1D_FLUSH_COND]	 = {"cond", true},
	[VMENTER_L1D_FLUSH_ALWAYS]	 = {"always", true},
	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};

#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;

static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
	struct page *page;
	unsigned int i;

	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
		return 0;
	}

	if (!enable_ept) {
		l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
		return 0;
	}

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
		u64 msr;

		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
		if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
			l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
			return 0;
		}
	}

	/* If set to auto use the default l1tf mitigation method */
	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
		switch (l1tf_mitigation) {
		case L1TF_MITIGATION_OFF:
			l1tf = VMENTER_L1D_FLUSH_NEVER;
			break;
		case L1TF_MITIGATION_FLUSH_NOWARN:
		case L1TF_MITIGATION_FLUSH:
		case L1TF_MITIGATION_FLUSH_NOSMT:
			l1tf = VMENTER_L1D_FLUSH_COND;
			break;
		case L1TF_MITIGATION_FULL:
		case L1TF_MITIGATION_FULL_FORCE:
			l1tf = VMENTER_L1D_FLUSH_ALWAYS;
			break;
		}
	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
		l1tf = VMENTER_L1D_FLUSH_ALWAYS;
	}

	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		/*
		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
		 * lifetime and so should not be charged to a memcg.
		 */
		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
		if (!page)
			return -ENOMEM;
		vmx_l1d_flush_pages = page_address(page);
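
		/*
		 * For scale (illustrative arithmetic, not taken from the code
		 * above): L1D_CACHE_ORDER is 4, so this buffer is
		 * 1 << 4 = 16 pages, i.e. 64KiB with x86's 4KiB pages, which
		 * is intended to be large enough for the flush loop to
		 * displace the whole L1D when the CPU lacks the FLUSH_L1D
		 * command MSR.
		 */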
		/*
		 * Initialize each page with a different pattern in
		 * order to protect against KSM in the nested
		 * virtualization case.
		 */
		for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
			memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
			       PAGE_SIZE);
		}
	}

	l1tf_vmx_mitigation = l1tf;

	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
		static_branch_enable(&vmx_l1d_should_flush);
	else
		static_branch_disable(&vmx_l1d_should_flush);

	if (l1tf == VMENTER_L1D_FLUSH_COND)
		static_branch_enable(&vmx_l1d_flush_cond);
	else
		static_branch_disable(&vmx_l1d_flush_cond);
	return 0;
}

static int vmentry_l1d_flush_parse(const char *s)
{
	unsigned int i;

	if (s) {
		for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
			if (vmentry_l1d_param[i].for_parse &&
			    sysfs_streq(s, vmentry_l1d_param[i].option))
				return i;
		}
	}
	return -EINVAL;
}

static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
	int l1tf, ret;

	l1tf = vmentry_l1d_flush_parse(s);
	if (l1tf < 0)
		return l1tf;

	if (!boot_cpu_has(X86_BUG_L1TF))
		return 0;

	/*
	 * Has vmx_init() run already? If not then this is the pre init
	 * parameter parsing. In that case just store the value and let
	 * vmx_init() do the proper setup after enable_ept has been
	 * established.
	 */
	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
		vmentry_l1d_flush_param = l1tf;
		return 0;
	}

	mutex_lock(&vmx_l1d_flush_mutex);
	ret = vmx_setup_l1d_flush(l1tf);
	mutex_unlock(&vmx_l1d_flush_mutex);
	return ret;
}

static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
		return sprintf(s, "???\n");

	return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}

static const struct kernel_param_ops vmentry_l1d_flush_ops = {
	.set = vmentry_l1d_flush_set,
	.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
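
/*
 * Example (illustrative) of driving the parameter registered above: it is
 * writable at runtime (mode 0644), so the L1TF mitigation mode can be
 * switched without reloading the module, e.g.
 *
 *   echo always > /sys/module/kvm_intel/parameters/vmentry_l1d_flush
 *   cat /sys/module/kvm_intel/parameters/vmentry_l1d_flush
 *
 * Strings other than "auto", "never", "cond" and "always" are rejected by
 * vmentry_l1d_flush_parse() with -EINVAL.
 */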
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
							  u32 msr, int type);

void vmx_vmexit(void);

#define vmx_insn_failed(fmt...)		\
do {					\
	WARN_ONCE(1, fmt);		\
	pr_warn_ratelimited(fmt);	\
} while (0)

asmlinkage void vmread_error(unsigned long field, bool fault)
{
	if (fault)
		kvm_spurious_fault();
	else
		vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
}

noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%d\n",
			field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}

noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("kvm: vmclear failed: %p/%llx\n", vmcs, phys_addr);
}

noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
	vmx_insn_failed("kvm: vmptrld failed: %p/%llx\n", vmcs, phys_addr);
}

noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
	vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
			ext, vpid, gva);
}

noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
	vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
			ext, eptp, gpa);
}

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked-list of the VMCSs loaded on that CPU. This is
 * needed when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded
 * on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);

/*
 * We maintain a per-CPU linked-list of vCPUs, so in wakeup_handler() we
 * can find which vCPU should be woken up.
 */
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

struct vmcs_config vmcs_config;
struct vmx_capability vmx_capability;

#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {					\
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,			\
		.limit = GUEST_##seg##_LIMIT,			\
		.ar_bytes = GUEST_##seg##_AR_BYTES,		\
	}

static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

u64 host_efer;
static unsigned long host_idt_base;

/*
 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
 * will emulate SYSCALL in legacy mode if the vendor string in guest
 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
 * support this emulation, IA32_STAR must always be included in
 * vmx_msr_index[], even in i386 builds.
 */
const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};

#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);

/* check_ept_pointer_match() should be called under protection of ept_pointer_lock. */
static void check_ept_pointer_match(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	u64 tmp_eptp = INVALID_PAGE;
	int i;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (!VALID_PAGE(tmp_eptp)) {
			tmp_eptp = to_vmx(vcpu)->ept_pointer;
		} else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
			to_kvm_vmx(kvm)->ept_pointers_match
				= EPT_POINTERS_MISMATCH;
			return;
		}
	}

	to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
}

static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
		void *data)
{
	struct kvm_tlb_range *range = data;

	return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
			range->pages);
}

static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
		struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
{
	u64 ept_pointer = to_vmx(vcpu)->ept_pointer;

	/*
	 * The FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address
	 * of the base of the EPT PML4 table, so strip off the EPT
	 * configuration information.
	 */
	if (range)
		return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
				kvm_fill_hv_flush_list_func, (void *)range);
	else
		return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
}

static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
		struct kvm_tlb_range *range)
{
	struct kvm_vcpu *vcpu;
	int ret = 0, i;

	spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);

	if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
		check_ept_pointer_match(kvm);

	if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
		kvm_for_each_vcpu(i, vcpu, kvm) {
			/* If ept_pointer is invalid, bypass the flush request. */
			if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
				ret |= __hv_remote_flush_tlb_with_range(
					kvm, vcpu, range);
		}
	} else {
		ret = __hv_remote_flush_tlb_with_range(kvm,
				kvm_get_vcpu(kvm, 0), range);
	}

	spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
	return ret;
}
static int hv_remote_flush_tlb(struct kvm *kvm)
{
	return hv_remote_flush_tlb_with_range(kvm, NULL);
}

static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
{
	struct hv_enlightened_vmcs *evmcs;
	struct hv_partition_assist_pg **p_hv_pa_pg =
			&vcpu->kvm->arch.hyperv.hv_pa_pg;
	/*
	 * Synthetic VM-Exit is not enabled in the current code, so all
	 * eVMCSs in a single VM share the same assist page.
	 */
	if (!*p_hv_pa_pg)
		*p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);

	if (!*p_hv_pa_pg)
		return -ENOMEM;

	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;

	evmcs->partition_assist_page =
		__pa(*p_hv_pa_pg);
	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
	evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;

	return 0;
}

#endif /* IS_ENABLED(CONFIG_HYPERV) */
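
/*
 * Rough sketch of the flow above when KVM itself runs on Hyper-V: the
 * remote-flush hooks prefer a single FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE
 * (or _LIST) hypercall covering the whole VM, which is only valid while
 * every vCPU shares one EPT pointer; as soon as check_ept_pointer_match()
 * detects a mismatch, the code falls back to issuing one hypercall per
 * vCPU that has a valid EPT pointer.
 */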
/*
 * Comment's format: document - errata name - stepping - processor name.
 * Taken from
 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
 */
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86   - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65  - C2 - Xeon L3406 */
/* 322814.pdf - AAT59  - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65  - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65  - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95  - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92  - B1 - i7-800 and i5-700 Desktop */
/*
 * 320767.pdf - AAP86  - B1 -
 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
 */
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
/* 321333.pdf - AAM126 - D0 - Xeon 3500 */
/* 321324.pdf - AAK139 - D0 - Xeon 5500 */
/* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
/* Xeon E3-1220 V2 */
0x000306A8,
};

static inline bool cpu_has_broken_vmx_preemption_timer(void)
{
	u32 eax = cpuid_eax(0x00000001), i;

	/* Clear the reserved bits */
	eax &= ~(0x3U << 14 | 0xfU << 28);
	for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
		if (eax == vmx_preemption_cpu_tfms[i])
			return true;

	return false;
}

static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
	return flexpriority_enabled && lapic_in_kernel(vcpu);
}

static inline bool report_flexpriority(void)
{
	return flexpriority_enabled;
}

static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	for (i = 0; i < vmx->nmsrs; ++i)
		if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
			return i;
	return -1;
}

struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = __find_msr_index(vmx, msr);
	if (i >= 0)
		return &vmx->guest_msrs[i];
	return NULL;
}

void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
{
	vmcs_clear(loaded_vmcs->vmcs);
	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
		vmcs_clear(loaded_vmcs->shadow_vmcs);
	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;
}

#ifdef CONFIG_KEXEC_CORE
/*
 * This bitmap is used to indicate whether the vmclear
 * operation is enabled on all cpus. All disabled by
 * default.
 */
static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;

static inline void crash_enable_local_vmclear(int cpu)
{
	cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static inline void crash_disable_local_vmclear(int cpu)
{
	cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static inline int crash_local_vmclear_enabled(int cpu)
{
	return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static void crash_vmclear_local_loaded_vmcss(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v;

	if (!crash_local_vmclear_enabled(cpu))
		return;

	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
			    loaded_vmcss_on_cpu_link)
		vmcs_clear(v->vmcs);
}
#else
static inline void crash_enable_local_vmclear(int cpu) { }
static inline void crash_disable_local_vmclear(int cpu) { }
#endif /* CONFIG_KEXEC_CORE */

static void __loaded_vmcs_clear(void *arg)
{
	struct loaded_vmcs *loaded_vmcs = arg;
	int cpu = raw_smp_processor_id();

	if (loaded_vmcs->cpu != cpu)
		return; /* vcpu migration can race with cpu offline */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;
	crash_disable_local_vmclear(cpu);
	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

	/*
	 * Ensure that the VMCS is removed from the per-CPU list before
	 * loaded_vmcs->cpu is set to -1 in loaded_vmcs_init(). Otherwise
	 * another CPU could observe cpu == -1 first and add the VMCS to
	 * its own per-CPU list before it has been deleted from this one.
	 */
	smp_wmb();

	loaded_vmcs_init(loaded_vmcs);
	crash_enable_local_vmclear(cpu);
}

void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
	int cpu = loaded_vmcs->cpu;

	if (cpu != -1)
		smp_call_function_single(cpu,
			 __loaded_vmcs_clear, loaded_vmcs, 1);
}

static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
				       unsigned field)
{
	bool ret;
	u32 mask = 1 << (seg * SEG_FIELD_NR + field);

	if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
		vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
		vmx->segment_cache.bitmask = 0;
	}
	ret = vmx->segment_cache.bitmask & mask;
	vmx->segment_cache.bitmask |= mask;
	return ret;
}

static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
	u16 *p = &vmx->segment_cache.seg[seg].selector;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
	return *p;
}

static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
	ulong *p = &vmx->segment_cache.seg[seg].base;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
	return *p;
}

static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].limit;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
	return *p;
}

static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].ar;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
	return *p;
}
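
/*
 * Illustrative note on the cache used above: each (segment, field) pair
 * owns one bit of segment_cache.bitmask, computed as
 * seg * SEG_FIELD_NR + field, so every VMCS segment field is read at most
 * once until the whole cache is invalidated by clearing
 * VCPU_EXREG_SEGMENTS in regs_avail (which is done whenever guest segment
 * state may have changed).
 */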
void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
		(1u << DB_VECTOR) | (1u << AC_VECTOR);
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		eb |= (1u << GP_VECTOR);
	if ((vcpu->guest_debug &
	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
		eb |= 1u << BP_VECTOR;
	if (to_vmx(vcpu)->rmode.vm86_active)
		eb = ~0;
	if (enable_ept)
		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */

	/* When we are running a nested L2 guest and L1 specified for it a
	 * certain exception bitmap, we must trap the same exceptions and pass
	 * them to L1. When running L2, we will only handle the exceptions
	 * specified above if L1 did not want them.
	 */
	if (is_guest_mode(vcpu))
		eb |= get_vmcs12(vcpu)->exception_bitmap;

	vmcs_write32(EXCEPTION_BITMAP, eb);
}

/*
 * Check if a write to the given MSR is intercepted by the currently loaded
 * MSR bitmap.
 */
static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;

	if (msr <= 0x1fff) {
		return !!test_bit(msr, msr_bitmap + 0x800 / f);
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
	}

	return true;
}

static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit)
{
	vm_entry_controls_clearbit(vmx, entry);
	vm_exit_controls_clearbit(vmx, exit);
}

static int find_msr(struct vmx_msrs *m, unsigned int msr)
{
	unsigned int i;

	for (i = 0; i < m->nr; ++i) {
		if (m->val[i].index == msr)
			return i;
	}
	return -ENOENT;
}

static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
	int i;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
			return;
		}
		break;
	}
	i = find_msr(&m->guest, msr);
	if (i < 0)
		goto skip_guest;
	--m->guest.nr;
	m->guest.val[i] = m->guest.val[m->guest.nr];
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);

skip_guest:
	i = find_msr(&m->host, msr);
	if (i < 0)
		return;

	--m->host.nr;
	m->host.val[i] = m->host.val[m->host.nr];
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
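
/*
 * Illustrative summary of the autoload machinery implemented by
 * clear_atomic_switch_msr() above and add_atomic_switch_msr() below: each
 * call maintains {index, value} pairs in msr_autoload.guest/host and mirrors
 * the list lengths into VM_ENTRY_MSR_LOAD_COUNT / VM_EXIT_MSR_LOAD_COUNT, so
 * the CPU itself swaps those MSRs on every VM entry and exit.  MSRs with
 * dedicated VMCS fields (EFER, PERF_GLOBAL_CTRL) take the *_special() path
 * instead and never consume one of the NR_AUTOLOAD_MSRS slots.
 */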
static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit,
		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
		u64 guest_val, u64 host_val)
{
	vmcs_write64(guest_val_vmcs, guest_val);
	if (host_val_vmcs != HOST_IA32_EFER)
		vmcs_write64(host_val_vmcs, host_val);
	vm_entry_controls_setbit(vmx, entry);
	vm_exit_controls_setbit(vmx, exit);
}

static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val, bool entry_only)
{
	int i, j = 0;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER,
					GUEST_IA32_EFER,
					HOST_IA32_EFER,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl()) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
					GUEST_IA32_PERF_GLOBAL_CTRL,
					HOST_IA32_PERF_GLOBAL_CTRL,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_IA32_PEBS_ENABLE:
		/* PEBS needs a quiescent period after being disabled (to write
		 * a record). Disabling PEBS through VMX MSR swapping doesn't
		 * provide that period, so a CPU could write host's record into
		 * guest's memory.
		 */
		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
	}

	i = find_msr(&m->guest, msr);
	if (!entry_only)
		j = find_msr(&m->host, msr);

	if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) ||
	    (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) {
		printk_once(KERN_WARNING "Not enough msr switch entries. "
				"Can't add msr %x\n", msr);
		return;
	}
	if (i < 0) {
		i = m->guest.nr++;
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
	}
	m->guest.val[i].index = msr;
	m->guest.val[i].value = guest_val;

	if (entry_only)
		return;

	if (j < 0) {
		j = m->host.nr++;
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
	}
	m->host.val[j].index = msr;
	m->host.val[j].value = host_val;
}

static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
	u64 guest_efer = vmx->vcpu.arch.efer;
	u64 ignore_bits = 0;

	if (!enable_ept) {
		/*
		 * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
		 * host CPUID is more efficient than testing guest CPUID
		 * or CR4. Host SMEP is anyway a requirement for guest SMEP.
		 */
		if (boot_cpu_has(X86_FEATURE_SMEP))
			guest_efer |= EFER_NX;
		else if (!(guest_efer & EFER_NX))
			ignore_bits |= EFER_NX;
	}

	/*
	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
	 */
	ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif

	/*
	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
	 * On CPUs that support "load IA32_EFER", always switch EFER
	 * atomically, since it's faster than switching it manually.
	 */
	if (cpu_has_load_ia32_efer() ||
	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
		if (!(guest_efer & EFER_LMA))
			guest_efer &= ~EFER_LME;
		if (guest_efer != host_efer)
			add_atomic_switch_msr(vmx, MSR_EFER,
					      guest_efer, host_efer, false);
		else
			clear_atomic_switch_msr(vmx, MSR_EFER);
		return false;
	} else {
		clear_atomic_switch_msr(vmx, MSR_EFER);

		guest_efer &= ~ignore_bits;
		guest_efer |= host_efer & ignore_bits;

		vmx->guest_msrs[efer_offset].data = guest_efer;
		vmx->guest_msrs[efer_offset].mask = ~ignore_bits;

		return true;
	}
}

#ifdef CONFIG_X86_32
/*
 * On 32-bit kernels, VM exits still load the FS and GS bases from the
 * VMCS rather than the segment table. KVM uses this helper to figure
 * out the current bases to poke them into the VMCS before entry.
 */
static unsigned long segment_base(u16 selector)
{
	struct desc_struct *table;
	unsigned long v;

	if (!(selector & ~SEGMENT_RPL_MASK))
		return 0;

	table = get_current_gdt_ro();

	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
		u16 ldt_selector = kvm_read_ldt();

		if (!(ldt_selector & ~SEGMENT_RPL_MASK))
			return 0;

		table = (struct desc_struct *)segment_base(ldt_selector);
	}
	v = get_desc_base(&table[selector >> 3]);
	return v;
}
#endif

static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
	u32 i;

	rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
	rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
	for (i = 0; i < addr_range; i++) {
		rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
		rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
	}
}

static void pt_guest_enter(struct vcpu_vmx *vmx)
{
	if (pt_mode == PT_MODE_SYSTEM)
		return;

	/*
	 * GUEST_IA32_RTIT_CTL is already set in the VMCS.
	 * Save host state before VM entry.
	 */
	rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		wrmsrl(MSR_IA32_RTIT_CTL, 0);
		pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
		pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
	}
}

static void pt_guest_exit(struct vcpu_vmx *vmx)
{
	if (pt_mode == PT_MODE_SYSTEM)
		return;

	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
		pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
		pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
	}

	/* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
	wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
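
/*
 * Illustrative note on the two helpers above: in host-guest mode they are
 * called around VM entry/exit to swap the Intel PT context.  The address
 * range MSRs come in A/B pairs, hence the "i * 2" stride; with
 * addr_range == 2, for example, the loops touch MSR_IA32_RTIT_ADDR0_A/B and
 * MSR_IA32_RTIT_ADDR1_A/B.  pt_guest_enter() clears RTIT_CTL before the
 * context is swapped so that no trace data lands in the wrong output buffer.
 */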
void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
			unsigned long fs_base, unsigned long gs_base)
{
	if (unlikely(fs_sel != host->fs_sel)) {
		if (!(fs_sel & 7))
			vmcs_write16(HOST_FS_SELECTOR, fs_sel);
		else
			vmcs_write16(HOST_FS_SELECTOR, 0);
		host->fs_sel = fs_sel;
	}
	if (unlikely(gs_sel != host->gs_sel)) {
		if (!(gs_sel & 7))
			vmcs_write16(HOST_GS_SELECTOR, gs_sel);
		else
			vmcs_write16(HOST_GS_SELECTOR, 0);
		host->gs_sel = gs_sel;
	}
	if (unlikely(fs_base != host->fs_base)) {
		vmcs_writel(HOST_FS_BASE, fs_base);
		host->fs_base = fs_base;
	}
	if (unlikely(gs_base != host->gs_base)) {
		vmcs_writel(HOST_GS_BASE, gs_base);
		host->gs_base = gs_base;
	}
}

void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
	int cpu = raw_smp_processor_id();
#endif
	unsigned long fs_base, gs_base;
	u16 fs_sel, gs_sel;
	int i;

	vmx->req_immediate_exit = false;

	/*
	 * Note that guest MSRs to be saved/restored can also be changed
	 * when guest state is loaded. This happens when guest transitions
	 * to/from long-mode by setting MSR_EFER.LMA.
	 */
	if (!vmx->guest_msrs_ready) {
		vmx->guest_msrs_ready = true;
		for (i = 0; i < vmx->save_nmsrs; ++i)
			kvm_set_shared_msr(vmx->guest_msrs[i].index,
					   vmx->guest_msrs[i].data,
					   vmx->guest_msrs[i].mask);

	}
	if (vmx->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	/*
	 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	host_state->ldt_sel = kvm_read_ldt();

#ifdef CONFIG_X86_64
	savesegment(ds, host_state->ds_sel);
	savesegment(es, host_state->es_sel);

	gs_base = cpu_kernelmode_gs_base(cpu);
	if (likely(is_64bit_mm(current->mm))) {
		save_fsgs_for_kvm();
		fs_sel = current->thread.fsindex;
		gs_sel = current->thread.gsindex;
		fs_base = current->thread.fsbase;
		vmx->msr_host_kernel_gs_base = current->thread.gsbase;
	} else {
		savesegment(fs, fs_sel);
		savesegment(gs, gs_sel);
		fs_base = read_msr(MSR_FS_BASE);
		vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
	}

	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
	savesegment(fs, fs_sel);
	savesegment(gs, gs_sel);
	fs_base = segment_base(fs_sel);
	gs_base = segment_base(gs_sel);
#endif

	vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
	vmx->guest_state_loaded = true;
}

static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
	struct vmcs_host_state *host_state;

	if (!vmx->guest_state_loaded)
		return;

	host_state = &vmx->loaded_vmcs->host_state;

	++vmx->vcpu.stat.host_state_reload;

#ifdef CONFIG_X86_64
	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
		kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
		load_gs_index(host_state->gs_sel);
#else
		loadsegment(gs, host_state->gs_sel);
#endif
	}
	if (host_state->fs_sel & 7)
		loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
		loadsegment(ds, host_state->ds_sel);
		loadsegment(es, host_state->es_sel);
	}
#endif
	invalidate_tss_limit();
#ifdef CONFIG_X86_64
	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
	load_fixmap_gdt(raw_smp_processor_id());
	vmx->guest_state_loaded = false;
	vmx->guest_msrs_ready = false;
}

#ifdef CONFIG_X86_64
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
	preempt_enable();
	return vmx->msr_guest_kernel_gs_base;
}

static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
	preempt_disable();
	if (vmx->guest_state_loaded)
		wrmsrl(MSR_KERNEL_GS_BASE, data);
	preempt_enable();
	vmx->msr_guest_kernel_gs_base = data;
}
#endif

static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
	struct pi_desc old, new;
	unsigned int dest;

	/*
	 * In case of hot-plug or hot-unplug, we may have to undo
	 * vmx_vcpu_pi_put even if there is no assigned device.  And we
	 * always keep PI.NDST up to date for simplicity: it makes the
	 * code easier, and CPU migration is not a fast path.
	 */
	if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
		return;
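
	/*
	 * Illustrative note, not a statement of the full protocol: NDST
	 * tells the IOMMU which CPU's posted-interrupt notification to
	 * target.  In xAPIC mode the 8-bit APIC ID is encoded in bits 15:8
	 * of NDST (hence the "dest << 8" below), while in x2APIC mode the
	 * full 32-bit APIC ID is stored directly.
	 */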
	/* The full case.  */
	do {
		old.control = new.control = pi_desc->control;

		dest = cpu_physical_id(cpu);

		if (x2apic_enabled())
			new.ndst = dest;
		else
			new.ndst = (dest << 8) & 0xFF00;

		new.sn = 0;
	} while (cmpxchg64(&pi_desc->control, old.control,
			   new.control) != old.control);

	/*
	 * Clear SN before reading the bitmap.  The VT-d firmware
	 * writes the bitmap and reads SN atomically (5.2.3 in the
	 * spec), so it doesn't really have a memory barrier that
	 * pairs with this, but we cannot do that and we need one.
	 */
	smp_mb__after_atomic();

	if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
		pi_set_on(pi_desc);
}

void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;

	if (!already_loaded) {
		loaded_vmcs_clear(vmx->loaded_vmcs);
		local_irq_disable();
		crash_disable_local_vmclear(cpu);

		/*
		 * Reading loaded_vmcs->cpu must happen before fetching
		 * loaded_vmcs->loaded_vmcss_on_cpu_link.
		 * See the comments in __loaded_vmcs_clear().
		 */
		smp_rmb();

		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
			 &per_cpu(loaded_vmcss_on_cpu, cpu));
		crash_enable_local_vmclear(cpu);
		local_irq_enable();
	}

	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
		vmcs_load(vmx->loaded_vmcs->vmcs);
		indirect_branch_prediction_barrier();
	}

	if (!already_loaded) {
		void *gdt = get_current_gdt_ro();
		unsigned long sysenter_esp;

		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);

		/*
		 * Linux uses per-cpu TSS and GDT, so set these when switching
		 * processors.  See 22.2.4.
		 */
		vmcs_writel(HOST_TR_BASE,
			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */

		/*
		 * VM exits change the host TR limit to 0x67 after a VM
		 * exit.  This is okay, since 0x67 covers everything except
		 * the IO bitmap and we have code to handle the IO bitmap
		 * being lost after a VM exit.
		 */
		BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);

		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */

		vmx->loaded_vmcs->cpu = cpu;
	}

	/* Setup TSC multiplier */
	if (kvm_has_tsc_control &&
	    vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
		decache_tsc_multiplier(vmx);
}

/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
 */
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx_vcpu_load_vmcs(vcpu, cpu);

	vmx_vcpu_pi_load(vcpu, cpu);

	vmx->host_pkru = read_pkru();
	vmx->host_debugctlmsr = get_debugctlmsr();
}

static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
		!kvm_vcpu_apicv_active(vcpu))
		return;

	/* Set SN when the vCPU is preempted */
	if (vcpu->preempted)
		pi_set_sn(pi_desc);
}

static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);

	vmx_prepare_switch_to_host(to_vmx(vcpu));
}

static bool emulation_required(struct kvm_vcpu *vcpu)
{
	return emulate_invalid_guest_state && !guest_state_valid(vcpu);
}

static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);

unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
	unsigned long rflags, save_rflags;

	if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
		__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
		rflags = vmcs_readl(GUEST_RFLAGS);
		if (to_vmx(vcpu)->rmode.vm86_active) {
			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
			save_rflags = to_vmx(vcpu)->rmode.save_rflags;
			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
		}
		to_vmx(vcpu)->rflags = rflags;
	}
	return to_vmx(vcpu)->rflags;
}

void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	unsigned long old_rflags = vmx_get_rflags(vcpu);

	__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
	to_vmx(vcpu)->rflags = rflags;
	if (to_vmx(vcpu)->rmode.vm86_active) {
		to_vmx(vcpu)->rmode.save_rflags = rflags;
		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
	}
	vmcs_writel(GUEST_RFLAGS, rflags);

	if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
		to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
}

u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	int ret = 0;

	if (interruptibility & GUEST_INTR_STATE_STI)
		ret |= KVM_X86_SHADOW_INT_STI;
	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
		ret |= KVM_X86_SHADOW_INT_MOV_SS;

	return ret;
}

void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
	u32 interruptibility = interruptibility_old;

	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);

	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
		interruptibility |= GUEST_INTR_STATE_MOV_SS;
	else if (mask & KVM_X86_SHADOW_INT_STI)
		interruptibility |= GUEST_INTR_STATE_STI;

	if ((interruptibility != interruptibility_old))
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}

static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long value;

	/*
	 * Any MSR write that attempts to change bits marked reserved will
	 * cause a #GP fault.
	 */
	if (data & vmx->pt_desc.ctl_bitmask)
		return 1;

	/*
	 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
	 * result in a #GP unless the same write also clears TraceEn.
	 */
	if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
		((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
		return 1;

	/*
	 * A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears both ToPA
	 * and FabricEn will cause a #GP if
	 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0.
	 */
	if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
		!(data & RTIT_CTL_FABRIC_EN) &&
		!intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_single_range_output))
		return 1;

	/*
	 * MTCFreq, CycThresh and PSBFreq encodings check: any MSR write that
	 * uses an encoding marked reserved will cause a #GP fault.
	 */
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
			!test_bit((data & RTIT_CTL_MTC_RANGE) >>
			RTIT_CTL_MTC_RANGE_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_cycle_thresholds);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
			!test_bit((data & RTIT_CTL_CYC_THRESH) >>
			RTIT_CTL_CYC_THRESH_OFFSET, &value))
		return 1;
	value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
			!test_bit((data & RTIT_CTL_PSB_FREQ) >>
			RTIT_CTL_PSB_FREQ_OFFSET, &value))
		return 1;

	/*
	 * If ADDRx_CFG is reserved or the encoding is greater than 2, the
	 * write will cause a #GP fault.
	 */
	value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
	if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
	if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
	if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
		return 1;
	value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
	if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
		return 1;

	return 0;
}

static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	unsigned long rip;

	/*
	 * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
	 * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
	 * set when EPT misconfig occurs.  In practice, real hardware updates
	 * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
	 * (namely Hyper-V) don't set it due to it being undefined behavior,
	 * i.e. we end up advancing IP with some random value.
	 */
	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
	    to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
		rip = kvm_rip_read(vcpu);
		rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
		kvm_rip_write(vcpu, rip);
	} else {
		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;
	}

	/* skipping an emulated instruction also counts */
	vmx_set_interrupt_shadow(vcpu, 0);

	return 1;
}
static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
	/*
	 * Ensure that we clear the HLT state in the VMCS.  We don't need to
	 * explicitly skip the instruction because if the HLT state is set,
	 * then the instruction is already executing and RIP has already been
	 * advanced.
	 */
	if (kvm_hlt_in_guest(vcpu->kvm) &&
	    vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}

static void vmx_queue_exception(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned nr = vcpu->arch.exception.nr;
	bool has_error_code = vcpu->arch.exception.has_error_code;
	u32 error_code = vcpu->arch.exception.error_code;
	u32 intr_info = nr | INTR_INFO_VALID_MASK;

	kvm_deliver_exception_payload(vcpu);

	if (has_error_code) {
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
	}

	if (vmx->rmode.vm86_active) {
		int inc_eip = 0;
		if (kvm_exception_is_soft(nr))
			inc_eip = vcpu->arch.event_exit_inst_len;
		kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
		return;
	}

	WARN_ON_ONCE(vmx->emulation_required);

	if (kvm_exception_is_soft(nr)) {
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
	} else
		intr_info |= INTR_TYPE_HARD_EXCEPTION;

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);

	vmx_clear_hlt(vcpu);
}

static bool vmx_rdtscp_supported(void)
{
	return cpu_has_vmx_rdtscp();
}

static bool vmx_invpcid_supported(void)
{
	return cpu_has_vmx_invpcid();
}

/*
 * Swap MSR entry in host/guest MSR entry array.
 */
static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
{
	struct shared_msr_entry tmp;

	tmp = vmx->guest_msrs[to];
	vmx->guest_msrs[to] = vmx->guest_msrs[from];
	vmx->guest_msrs[from] = tmp;
}
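
/*
 * Illustrative note on setup_msrs() below: it packs the MSRs that actually
 * need to be context-switched for the current guest mode at the front of
 * guest_msrs[] (the first save_nmsrs entries), using move_msr_up().  For a
 * 64-bit guest with EFER.SCE set, for example, the SYSCALL MSRs (STAR,
 * LSTAR, SYSCALL_MASK) are moved up, followed by EFER and TSC_AUX when
 * they qualify; the remaining entries are left in place and not switched.
 */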
/*
 * Set up the vmcs to automatically save and restore system
 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
 * mode, as fiddling with msrs is very expensive.
 */
static void setup_msrs(struct vcpu_vmx *vmx)
{
	int save_nmsrs, index;

	save_nmsrs = 0;
#ifdef CONFIG_X86_64
	/*
	 * The SYSCALL MSRs are only needed on long mode guests, and only
	 * when EFER.SCE is set.
	 */
	if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
		index = __find_msr_index(vmx, MSR_STAR);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_LSTAR);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
		index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
		if (index >= 0)
			move_msr_up(vmx, index, save_nmsrs++);
	}
#endif
	index = __find_msr_index(vmx, MSR_EFER);
	if (index >= 0 && update_transition_efer(vmx, index))
		move_msr_up(vmx, index, save_nmsrs++);
	index = __find_msr_index(vmx, MSR_TSC_AUX);
	if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
		move_msr_up(vmx, index, save_nmsrs++);

	vmx->save_nmsrs = save_nmsrs;
	vmx->guest_msrs_ready = false;

	if (cpu_has_vmx_msr_bitmap())
		vmx_update_msr_bitmap(&vmx->vcpu);
}

static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (is_guest_mode(vcpu) &&
	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
		return vcpu->arch.tsc_offset - vmcs12->tsc_offset;

	return vcpu->arch.tsc_offset;
}

static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u64 g_tsc_offset = 0;

	/*
	 * We're here if L1 chose not to trap WRMSR to TSC. According
	 * to the spec, this should set L1's TSC; the offset that L1
	 * set for L2 remains unchanged, and still needs to be added
	 * to the newly set TSC to get L2's TSC.
	 */
	if (is_guest_mode(vcpu) &&
	    (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
		g_tsc_offset = vmcs12->tsc_offset;

	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
				   vcpu->arch.tsc_offset - g_tsc_offset,
				   offset);
	vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
	return offset + g_tsc_offset;
}

/*
 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
 * all guests if the "nested" module option is off, and can also be disabled
 * for a single guest by disabling its VMX cpuid bit.
 */
bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
{
	return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
}

static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
						 uint64_t val)
{
	uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;

	return !(val & ~valid_bits);
}

static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
{
	switch (msr->index) {
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		if (!nested)
			return 1;
		return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
	default:
		return 1;
	}

	return 0;
}

/*
 * Reads an msr value (of 'msr_index') into 'pdata'.
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct shared_msr_entry *msr;
	u32 index;

	switch (msr_info->index) {
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		msr_info->data = vmcs_readl(GUEST_FS_BASE);
		break;
	case MSR_GS_BASE:
		msr_info->data = vmcs_readl(GUEST_GS_BASE);
		break;
	case MSR_KERNEL_GS_BASE:
		msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
		break;
#endif
	case MSR_EFER:
		return kvm_get_msr_common(vcpu, msr_info);
	case MSR_IA32_UMWAIT_CONTROL:
		if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
			return 1;

		msr_info->data = vmx->msr_ia32_umwait_control;
		break;
	case MSR_IA32_SPEC_CTRL:
		if (!msr_info->host_initiated &&
		    !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
			return 1;

		msr_info->data = to_vmx(vcpu)->spec_ctrl;
		break;
	case MSR_IA32_SYSENTER_CS:
		msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
		break;
	case MSR_IA32_SYSENTER_EIP:
		msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
		break;
	case MSR_IA32_SYSENTER_ESP:
		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
		break;
	case MSR_IA32_BNDCFGS:
		if (!kvm_mpx_supported() ||
		    (!msr_info->host_initiated &&
		     !guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
			return 1;
		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
		break;
	case MSR_IA32_MCG_EXT_CTL:
		if (!msr_info->host_initiated &&
		    !(vmx->msr_ia32_feature_control &
		      FEATURE_CONTROL_LMCE))
			return 1;
		msr_info->data = vcpu->arch.mcg_ext_ctl;
		break;
	case MSR_IA32_FEATURE_CONTROL:
		msr_info->data = vmx->msr_ia32_feature_control;
		break;
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		if (!nested_vmx_allowed(vcpu))
			return 1;
		return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
				       &msr_info->data);
	case MSR_IA32_XSS:
		if (!vmx_xsaves_supported() ||
		    (!msr_info->host_initiated &&
		     !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
		       guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
			return 1;
		msr_info->data = vcpu->arch.ia32_xss;
		break;
	case MSR_IA32_RTIT_CTL:
		if (pt_mode != PT_MODE_HOST_GUEST)
			return 1;
		msr_info->data = vmx->pt_desc.guest.ctl;
		break;
	case MSR_IA32_RTIT_STATUS:
		if (pt_mode != PT_MODE_HOST_GUEST)
			return 1;
		msr_info->data = vmx->pt_desc.guest.status;
		break;
	case MSR_IA32_RTIT_CR3_MATCH:
		if ((pt_mode != PT_MODE_HOST_GUEST) ||
			!intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_cr3_filtering))
			return 1;
		msr_info->data = vmx->pt_desc.guest.cr3_match;
		break;
	case MSR_IA32_RTIT_OUTPUT_BASE:
		if ((pt_mode != PT_MODE_HOST_GUEST) ||
			(!intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_topa_output) &&
			 !intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_single_range_output)))
			return 1;
		msr_info->data = vmx->pt_desc.guest.output_base;
		break;
	case MSR_IA32_RTIT_OUTPUT_MASK:
		if ((pt_mode != PT_MODE_HOST_GUEST) ||
			(!intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_topa_output) &&
			 !intel_pt_validate_cap(vmx->pt_desc.caps,
					PT_CAP_single_range_output)))
			return 1;
		msr_info->data = vmx->pt_desc.guest.output_mask;
		break;
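	/*
	 * Worked example for the ADDRx case below (illustrative): the A/B
	 * MSRs are contiguous, so MSR_IA32_RTIT_ADDR1_B yields index 3,
	 * which is odd (an end-of-range MSR) and therefore maps to
	 * addr_b[3 / 2], i.e. addr_b[1].
	 */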
MSR_IA32_RTIT_ADDR3_B: 1867 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 1868 if ((pt_mode != PT_MODE_HOST_GUEST) || 1869 (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, 1870 PT_CAP_num_address_ranges))) 1871 return 1; 1872 if (index % 2) 1873 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; 1874 else 1875 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 1876 break; 1877 case MSR_TSC_AUX: 1878 if (!msr_info->host_initiated && 1879 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) 1880 return 1; 1881 /* Else, falls through */ 1882 default: 1883 msr = find_msr_entry(vmx, msr_info->index); 1884 if (msr) { 1885 msr_info->data = msr->data; 1886 break; 1887 } 1888 return kvm_get_msr_common(vcpu, msr_info); 1889 } 1890 1891 return 0; 1892 } 1893 1894 /* 1895 * Writes msr value into into the appropriate "register". 1896 * Returns 0 on success, non-0 otherwise. 1897 * Assumes vcpu_load() was already called. 1898 */ 1899 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 1900 { 1901 struct vcpu_vmx *vmx = to_vmx(vcpu); 1902 struct shared_msr_entry *msr; 1903 int ret = 0; 1904 u32 msr_index = msr_info->index; 1905 u64 data = msr_info->data; 1906 u32 index; 1907 1908 switch (msr_index) { 1909 case MSR_EFER: 1910 ret = kvm_set_msr_common(vcpu, msr_info); 1911 break; 1912 #ifdef CONFIG_X86_64 1913 case MSR_FS_BASE: 1914 vmx_segment_cache_clear(vmx); 1915 vmcs_writel(GUEST_FS_BASE, data); 1916 break; 1917 case MSR_GS_BASE: 1918 vmx_segment_cache_clear(vmx); 1919 vmcs_writel(GUEST_GS_BASE, data); 1920 break; 1921 case MSR_KERNEL_GS_BASE: 1922 vmx_write_guest_kernel_gs_base(vmx, data); 1923 break; 1924 #endif 1925 case MSR_IA32_SYSENTER_CS: 1926 if (is_guest_mode(vcpu)) 1927 get_vmcs12(vcpu)->guest_sysenter_cs = data; 1928 vmcs_write32(GUEST_SYSENTER_CS, data); 1929 break; 1930 case MSR_IA32_SYSENTER_EIP: 1931 if (is_guest_mode(vcpu)) 1932 get_vmcs12(vcpu)->guest_sysenter_eip = data; 1933 vmcs_writel(GUEST_SYSENTER_EIP, data); 1934 break; 1935 case MSR_IA32_SYSENTER_ESP: 1936 if (is_guest_mode(vcpu)) 1937 get_vmcs12(vcpu)->guest_sysenter_esp = data; 1938 vmcs_writel(GUEST_SYSENTER_ESP, data); 1939 break; 1940 case MSR_IA32_DEBUGCTLMSR: 1941 if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & 1942 VM_EXIT_SAVE_DEBUG_CONTROLS) 1943 get_vmcs12(vcpu)->guest_ia32_debugctl = data; 1944 1945 ret = kvm_set_msr_common(vcpu, msr_info); 1946 break; 1947 1948 case MSR_IA32_BNDCFGS: 1949 if (!kvm_mpx_supported() || 1950 (!msr_info->host_initiated && 1951 !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) 1952 return 1; 1953 if (is_noncanonical_address(data & PAGE_MASK, vcpu) || 1954 (data & MSR_IA32_BNDCFGS_RSVD)) 1955 return 1; 1956 vmcs_write64(GUEST_BNDCFGS, data); 1957 break; 1958 case MSR_IA32_UMWAIT_CONTROL: 1959 if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) 1960 return 1; 1961 1962 /* The reserved bit 1 and non-32 bit [63:32] should be zero */ 1963 if (data & (BIT_ULL(1) | GENMASK_ULL(63, 32))) 1964 return 1; 1965 1966 vmx->msr_ia32_umwait_control = data; 1967 break; 1968 case MSR_IA32_SPEC_CTRL: 1969 if (!msr_info->host_initiated && 1970 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) 1971 return 1; 1972 1973 /* The STIBP bit doesn't fault even if it's not advertised */ 1974 if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) 1975 return 1; 1976 1977 vmx->spec_ctrl = data; 1978 1979 if (!data) 1980 break; 1981 1982 /* 1983 * For non-nested: 1984 * When it's written (to non-zero) for the first time, pass 1985 * it through. 
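 * Disabling the intercept below means later guest accesses to
 * IA32_SPEC_CTRL no longer cause a VM-exit; the intercept is only
 * kept while the guest has never written a non-zero value.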
1986 * 1987 * For nested: 1988 * The handling of the MSR bitmap for L2 guests is done in 1989 * nested_vmx_merge_msr_bitmap. We should not touch the 1990 * vmcs02.msr_bitmap here since it gets completely overwritten 1991 * in the merging. We update the vmcs01 here for L1 as well 1992 * since it will end up touching the MSR anyway now. 1993 */ 1994 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, 1995 MSR_IA32_SPEC_CTRL, 1996 MSR_TYPE_RW); 1997 break; 1998 case MSR_IA32_PRED_CMD: 1999 if (!msr_info->host_initiated && 2000 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) 2001 return 1; 2002 2003 if (data & ~PRED_CMD_IBPB) 2004 return 1; 2005 2006 if (!data) 2007 break; 2008 2009 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); 2010 2011 /* 2012 * For non-nested: 2013 * When it's written (to non-zero) for the first time, pass 2014 * it through. 2015 * 2016 * For nested: 2017 * The handling of the MSR bitmap for L2 guests is done in 2018 * nested_vmx_merge_msr_bitmap. We should not touch the 2019 * vmcs02.msr_bitmap here since it gets completely overwritten 2020 * in the merging. 2021 */ 2022 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, 2023 MSR_TYPE_W); 2024 break; 2025 case MSR_IA32_CR_PAT: 2026 if (!kvm_pat_valid(data)) 2027 return 1; 2028 2029 if (is_guest_mode(vcpu) && 2030 get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 2031 get_vmcs12(vcpu)->guest_ia32_pat = data; 2032 2033 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2034 vmcs_write64(GUEST_IA32_PAT, data); 2035 vcpu->arch.pat = data; 2036 break; 2037 } 2038 ret = kvm_set_msr_common(vcpu, msr_info); 2039 break; 2040 case MSR_IA32_TSC_ADJUST: 2041 ret = kvm_set_msr_common(vcpu, msr_info); 2042 break; 2043 case MSR_IA32_MCG_EXT_CTL: 2044 if ((!msr_info->host_initiated && 2045 !(to_vmx(vcpu)->msr_ia32_feature_control & 2046 FEATURE_CONTROL_LMCE)) || 2047 (data & ~MCG_EXT_CTL_LMCE_EN)) 2048 return 1; 2049 vcpu->arch.mcg_ext_ctl = data; 2050 break; 2051 case MSR_IA32_FEATURE_CONTROL: 2052 if (!vmx_feature_control_msr_valid(vcpu, data) || 2053 (to_vmx(vcpu)->msr_ia32_feature_control & 2054 FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) 2055 return 1; 2056 vmx->msr_ia32_feature_control = data; 2057 if (msr_info->host_initiated && data == 0) 2058 vmx_leave_nested(vcpu); 2059 break; 2060 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 2061 if (!msr_info->host_initiated) 2062 return 1; /* they are read-only */ 2063 if (!nested_vmx_allowed(vcpu)) 2064 return 1; 2065 return vmx_set_vmx_msr(vcpu, msr_index, data); 2066 case MSR_IA32_XSS: 2067 if (!vmx_xsaves_supported() || 2068 (!msr_info->host_initiated && 2069 !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && 2070 guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) 2071 return 1; 2072 /* 2073 * The only supported bit as of Skylake is bit 8, but 2074 * it is not supported on KVM. 
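 * Consequently only a value of 0 is accepted by the check below.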
2075 */ 2076 if (data != 0) 2077 return 1; 2078 vcpu->arch.ia32_xss = data; 2079 if (vcpu->arch.ia32_xss != host_xss) 2080 add_atomic_switch_msr(vmx, MSR_IA32_XSS, 2081 vcpu->arch.ia32_xss, host_xss, false); 2082 else 2083 clear_atomic_switch_msr(vmx, MSR_IA32_XSS); 2084 break; 2085 case MSR_IA32_RTIT_CTL: 2086 if ((pt_mode != PT_MODE_HOST_GUEST) || 2087 vmx_rtit_ctl_check(vcpu, data) || 2088 vmx->nested.vmxon) 2089 return 1; 2090 vmcs_write64(GUEST_IA32_RTIT_CTL, data); 2091 vmx->pt_desc.guest.ctl = data; 2092 pt_update_intercept_for_msr(vmx); 2093 break; 2094 case MSR_IA32_RTIT_STATUS: 2095 if ((pt_mode != PT_MODE_HOST_GUEST) || 2096 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || 2097 (data & MSR_IA32_RTIT_STATUS_MASK)) 2098 return 1; 2099 vmx->pt_desc.guest.status = data; 2100 break; 2101 case MSR_IA32_RTIT_CR3_MATCH: 2102 if ((pt_mode != PT_MODE_HOST_GUEST) || 2103 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || 2104 !intel_pt_validate_cap(vmx->pt_desc.caps, 2105 PT_CAP_cr3_filtering)) 2106 return 1; 2107 vmx->pt_desc.guest.cr3_match = data; 2108 break; 2109 case MSR_IA32_RTIT_OUTPUT_BASE: 2110 if ((pt_mode != PT_MODE_HOST_GUEST) || 2111 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || 2112 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2113 PT_CAP_topa_output) && 2114 !intel_pt_validate_cap(vmx->pt_desc.caps, 2115 PT_CAP_single_range_output)) || 2116 (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)) 2117 return 1; 2118 vmx->pt_desc.guest.output_base = data; 2119 break; 2120 case MSR_IA32_RTIT_OUTPUT_MASK: 2121 if ((pt_mode != PT_MODE_HOST_GUEST) || 2122 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || 2123 (!intel_pt_validate_cap(vmx->pt_desc.caps, 2124 PT_CAP_topa_output) && 2125 !intel_pt_validate_cap(vmx->pt_desc.caps, 2126 PT_CAP_single_range_output))) 2127 return 1; 2128 vmx->pt_desc.guest.output_mask = data; 2129 break; 2130 case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: 2131 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 2132 if ((pt_mode != PT_MODE_HOST_GUEST) || 2133 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || 2134 (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, 2135 PT_CAP_num_address_ranges))) 2136 return 1; 2137 if (index % 2) 2138 vmx->pt_desc.guest.addr_b[index / 2] = data; 2139 else 2140 vmx->pt_desc.guest.addr_a[index / 2] = data; 2141 break; 2142 case MSR_TSC_AUX: 2143 if (!msr_info->host_initiated && 2144 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) 2145 return 1; 2146 /* Check reserved bit, higher 32 bits should be zero */ 2147 if ((data >> 32) != 0) 2148 return 1; 2149 /* Else, falls through */ 2150 default: 2151 msr = find_msr_entry(vmx, msr_index); 2152 if (msr) { 2153 u64 old_msr_data = msr->data; 2154 msr->data = data; 2155 if (msr - vmx->guest_msrs < vmx->save_nmsrs) { 2156 preempt_disable(); 2157 ret = kvm_set_shared_msr(msr->index, msr->data, 2158 msr->mask); 2159 preempt_enable(); 2160 if (ret) 2161 msr->data = old_msr_data; 2162 } 2163 break; 2164 } 2165 ret = kvm_set_msr_common(vcpu, msr_info); 2166 } 2167 2168 return ret; 2169 } 2170 2171 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2172 { 2173 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); 2174 switch (reg) { 2175 case VCPU_REGS_RSP: 2176 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2177 break; 2178 case VCPU_REGS_RIP: 2179 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2180 break; 2181 case VCPU_EXREG_PDPTR: 2182 if (enable_ept) 2183 ept_save_pdptrs(vcpu); 2184 break; 2185 default: 2186 break; 2187 } 2188 } 2189 2190 static __init int cpu_has_kvm_support(void) 2191 { 2192 return cpu_has_vmx(); 2193 } 2194 2195 static __init int vmx_disabled_by_bios(void) 2196 { 2197 u64 msr; 2198 2199 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 2200 if (msr & FEATURE_CONTROL_LOCKED) { 2201 /* launched w/ TXT and VMX disabled */ 2202 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 2203 && tboot_enabled()) 2204 return 1; 2205 /* launched w/o TXT and VMX only enabled w/ TXT */ 2206 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 2207 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 2208 && !tboot_enabled()) { 2209 printk(KERN_WARNING "kvm: disable TXT in the BIOS or " 2210 "activate TXT before enabling KVM\n"); 2211 return 1; 2212 } 2213 /* launched w/o TXT and VMX disabled */ 2214 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 2215 && !tboot_enabled()) 2216 return 1; 2217 } 2218 2219 return 0; 2220 } 2221 2222 static void kvm_cpu_vmxon(u64 addr) 2223 { 2224 cr4_set_bits(X86_CR4_VMXE); 2225 intel_pt_handle_vmx(1); 2226 2227 asm volatile ("vmxon %0" : : "m"(addr)); 2228 } 2229 2230 static int hardware_enable(void) 2231 { 2232 int cpu = raw_smp_processor_id(); 2233 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2234 u64 old, test_bits; 2235 2236 if (cr4_read_shadow() & X86_CR4_VMXE) 2237 return -EBUSY; 2238 2239 /* 2240 * This can happen if we hot-added a CPU but failed to allocate 2241 * VP assist page for it. 2242 */ 2243 if (static_branch_unlikely(&enable_evmcs) && 2244 !hv_get_vp_assist_page(cpu)) 2245 return -EFAULT; 2246 2247 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 2248 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); 2249 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 2250 2251 /* 2252 * Now we can enable the vmclear operation in kdump 2253 * since the loaded_vmcss_on_cpu list on this cpu 2254 * has been initialized. 
2255 * 2256 * Though the cpu is not in VMX operation now, there 2257 * is no problem to enable the vmclear operation 2258 * for the loaded_vmcss_on_cpu list is empty! 2259 */ 2260 crash_enable_local_vmclear(cpu); 2261 2262 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2263 2264 test_bits = FEATURE_CONTROL_LOCKED; 2265 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 2266 if (tboot_enabled()) 2267 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; 2268 2269 if ((old & test_bits) != test_bits) { 2270 /* enable and lock */ 2271 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); 2272 } 2273 kvm_cpu_vmxon(phys_addr); 2274 if (enable_ept) 2275 ept_sync_global(); 2276 2277 return 0; 2278 } 2279 2280 static void vmclear_local_loaded_vmcss(void) 2281 { 2282 int cpu = raw_smp_processor_id(); 2283 struct loaded_vmcs *v, *n; 2284 2285 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2286 loaded_vmcss_on_cpu_link) 2287 __loaded_vmcs_clear(v); 2288 } 2289 2290 2291 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() 2292 * tricks. 2293 */ 2294 static void kvm_cpu_vmxoff(void) 2295 { 2296 asm volatile (__ex("vmxoff")); 2297 2298 intel_pt_handle_vmx(0); 2299 cr4_clear_bits(X86_CR4_VMXE); 2300 } 2301 2302 static void hardware_disable(void) 2303 { 2304 vmclear_local_loaded_vmcss(); 2305 kvm_cpu_vmxoff(); 2306 } 2307 2308 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 2309 u32 msr, u32 *result) 2310 { 2311 u32 vmx_msr_low, vmx_msr_high; 2312 u32 ctl = ctl_min | ctl_opt; 2313 2314 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2315 2316 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2317 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2318 2319 /* Ensure minimum (required) set of control bits are supported. 
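 * E.g. if a bit requested in ctl_min is reported as "allowed-1 == 0"
 * by the capability MSR (its bit in vmx_msr_high is clear), it was
 * masked off above and the check below fails with -EIO.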
*/ 2320 if (ctl_min & ~ctl) 2321 return -EIO; 2322 2323 *result = ctl; 2324 return 0; 2325 } 2326 2327 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2328 struct vmx_capability *vmx_cap) 2329 { 2330 u32 vmx_msr_low, vmx_msr_high; 2331 u32 min, opt, min2, opt2; 2332 u32 _pin_based_exec_control = 0; 2333 u32 _cpu_based_exec_control = 0; 2334 u32 _cpu_based_2nd_exec_control = 0; 2335 u32 _vmexit_control = 0; 2336 u32 _vmentry_control = 0; 2337 2338 memset(vmcs_conf, 0, sizeof(*vmcs_conf)); 2339 min = CPU_BASED_HLT_EXITING | 2340 #ifdef CONFIG_X86_64 2341 CPU_BASED_CR8_LOAD_EXITING | 2342 CPU_BASED_CR8_STORE_EXITING | 2343 #endif 2344 CPU_BASED_CR3_LOAD_EXITING | 2345 CPU_BASED_CR3_STORE_EXITING | 2346 CPU_BASED_UNCOND_IO_EXITING | 2347 CPU_BASED_MOV_DR_EXITING | 2348 CPU_BASED_USE_TSC_OFFSETING | 2349 CPU_BASED_MWAIT_EXITING | 2350 CPU_BASED_MONITOR_EXITING | 2351 CPU_BASED_INVLPG_EXITING | 2352 CPU_BASED_RDPMC_EXITING; 2353 2354 opt = CPU_BASED_TPR_SHADOW | 2355 CPU_BASED_USE_MSR_BITMAPS | 2356 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2357 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 2358 &_cpu_based_exec_control) < 0) 2359 return -EIO; 2360 #ifdef CONFIG_X86_64 2361 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2362 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & 2363 ~CPU_BASED_CR8_STORE_EXITING; 2364 #endif 2365 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2366 min2 = 0; 2367 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2368 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2369 SECONDARY_EXEC_WBINVD_EXITING | 2370 SECONDARY_EXEC_ENABLE_VPID | 2371 SECONDARY_EXEC_ENABLE_EPT | 2372 SECONDARY_EXEC_UNRESTRICTED_GUEST | 2373 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 2374 SECONDARY_EXEC_DESC | 2375 SECONDARY_EXEC_RDTSCP | 2376 SECONDARY_EXEC_ENABLE_INVPCID | 2377 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2378 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2379 SECONDARY_EXEC_SHADOW_VMCS | 2380 SECONDARY_EXEC_XSAVES | 2381 SECONDARY_EXEC_RDSEED_EXITING | 2382 SECONDARY_EXEC_RDRAND_EXITING | 2383 SECONDARY_EXEC_ENABLE_PML | 2384 SECONDARY_EXEC_TSC_SCALING | 2385 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2386 SECONDARY_EXEC_PT_USE_GPA | 2387 SECONDARY_EXEC_PT_CONCEAL_VMX | 2388 SECONDARY_EXEC_ENABLE_VMFUNC | 2389 SECONDARY_EXEC_ENCLS_EXITING; 2390 if (adjust_vmx_controls(min2, opt2, 2391 MSR_IA32_VMX_PROCBASED_CTLS2, 2392 &_cpu_based_2nd_exec_control) < 0) 2393 return -EIO; 2394 } 2395 #ifndef CONFIG_X86_64 2396 if (!(_cpu_based_2nd_exec_control & 2397 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2398 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2399 #endif 2400 2401 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2402 _cpu_based_2nd_exec_control &= ~( 2403 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2404 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2405 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2406 2407 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, 2408 &vmx_cap->ept, &vmx_cap->vpid); 2409 2410 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 2411 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT 2412 enabled */ 2413 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 2414 CPU_BASED_CR3_STORE_EXITING | 2415 CPU_BASED_INVLPG_EXITING); 2416 } else if (vmx_cap->ept) { 2417 vmx_cap->ept = 0; 2418 pr_warn_once("EPT CAP should not exist if not support " 2419 "1-setting enable EPT VM-execution control\n"); 2420 } 2421 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && 2422 vmx_cap->vpid) { 2423 vmx_cap->vpid = 0; 
2424 pr_warn_once("VPID CAP should not exist if not support " 2425 "1-setting enable VPID VM-execution control\n"); 2426 } 2427 2428 min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; 2429 #ifdef CONFIG_X86_64 2430 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 2431 #endif 2432 opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | 2433 VM_EXIT_LOAD_IA32_PAT | 2434 VM_EXIT_LOAD_IA32_EFER | 2435 VM_EXIT_CLEAR_BNDCFGS | 2436 VM_EXIT_PT_CONCEAL_PIP | 2437 VM_EXIT_CLEAR_IA32_RTIT_CTL; 2438 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, 2439 &_vmexit_control) < 0) 2440 return -EIO; 2441 2442 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 2443 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | 2444 PIN_BASED_VMX_PREEMPTION_TIMER; 2445 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 2446 &_pin_based_exec_control) < 0) 2447 return -EIO; 2448 2449 if (cpu_has_broken_vmx_preemption_timer()) 2450 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 2451 if (!(_cpu_based_2nd_exec_control & 2452 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) 2453 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2454 2455 min = VM_ENTRY_LOAD_DEBUG_CONTROLS; 2456 opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 2457 VM_ENTRY_LOAD_IA32_PAT | 2458 VM_ENTRY_LOAD_IA32_EFER | 2459 VM_ENTRY_LOAD_BNDCFGS | 2460 VM_ENTRY_PT_CONCEAL_PIP | 2461 VM_ENTRY_LOAD_IA32_RTIT_CTL; 2462 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, 2463 &_vmentry_control) < 0) 2464 return -EIO; 2465 2466 /* 2467 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they 2468 * can't be used due to an errata where VM Exit may incorrectly clear 2469 * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the 2470 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 2471 */ 2472 if (boot_cpu_data.x86 == 0x6) { 2473 switch (boot_cpu_data.x86_model) { 2474 case 26: /* AAK155 */ 2475 case 30: /* AAP115 */ 2476 case 37: /* AAT100 */ 2477 case 44: /* BC86,AAY89,BD102 */ 2478 case 46: /* BA97 */ 2479 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 2480 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 2481 pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 2482 "does not work properly. Using workaround\n"); 2483 break; 2484 default: 2485 break; 2486 } 2487 } 2488 2489 2490 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); 2491 2492 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 2493 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) 2494 return -EIO; 2495 2496 #ifdef CONFIG_X86_64 2497 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ 2498 if (vmx_msr_high & (1u<<16)) 2499 return -EIO; 2500 #endif 2501 2502 /* Require Write-Back (WB) memory type for VMCS accesses. 
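 * Per the SDM, bits 53:50 of IA32_VMX_BASIC report the memory type the
 * processor uses for VMCS accesses; 6 means write-back, which is what
 * the (vmx_msr_high >> 18) & 15 test below checks (bit 50 of the MSR
 * is bit 18 of its high dword).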
*/ 2503 if (((vmx_msr_high >> 18) & 15) != 6) 2504 return -EIO; 2505 2506 vmcs_conf->size = vmx_msr_high & 0x1fff; 2507 vmcs_conf->order = get_order(vmcs_conf->size); 2508 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; 2509 2510 vmcs_conf->revision_id = vmx_msr_low; 2511 2512 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2513 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2514 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2515 vmcs_conf->vmexit_ctrl = _vmexit_control; 2516 vmcs_conf->vmentry_ctrl = _vmentry_control; 2517 2518 if (static_branch_unlikely(&enable_evmcs)) 2519 evmcs_sanitize_exec_ctrls(vmcs_conf); 2520 2521 return 0; 2522 } 2523 2524 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 2525 { 2526 int node = cpu_to_node(cpu); 2527 struct page *pages; 2528 struct vmcs *vmcs; 2529 2530 pages = __alloc_pages_node(node, flags, vmcs_config.order); 2531 if (!pages) 2532 return NULL; 2533 vmcs = page_address(pages); 2534 memset(vmcs, 0, vmcs_config.size); 2535 2536 /* KVM supports Enlightened VMCS v1 only */ 2537 if (static_branch_unlikely(&enable_evmcs)) 2538 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2539 else 2540 vmcs->hdr.revision_id = vmcs_config.revision_id; 2541 2542 if (shadow) 2543 vmcs->hdr.shadow_vmcs = 1; 2544 return vmcs; 2545 } 2546 2547 void free_vmcs(struct vmcs *vmcs) 2548 { 2549 free_pages((unsigned long)vmcs, vmcs_config.order); 2550 } 2551 2552 /* 2553 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 2554 */ 2555 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2556 { 2557 if (!loaded_vmcs->vmcs) 2558 return; 2559 loaded_vmcs_clear(loaded_vmcs); 2560 free_vmcs(loaded_vmcs->vmcs); 2561 loaded_vmcs->vmcs = NULL; 2562 if (loaded_vmcs->msr_bitmap) 2563 free_page((unsigned long)loaded_vmcs->msr_bitmap); 2564 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 2565 } 2566 2567 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2568 { 2569 loaded_vmcs->vmcs = alloc_vmcs(false); 2570 if (!loaded_vmcs->vmcs) 2571 return -ENOMEM; 2572 2573 loaded_vmcs->shadow_vmcs = NULL; 2574 loaded_vmcs->hv_timer_soft_disabled = false; 2575 loaded_vmcs_init(loaded_vmcs); 2576 2577 if (cpu_has_vmx_msr_bitmap()) { 2578 loaded_vmcs->msr_bitmap = (unsigned long *) 2579 __get_free_page(GFP_KERNEL_ACCOUNT); 2580 if (!loaded_vmcs->msr_bitmap) 2581 goto out_vmcs; 2582 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 2583 2584 if (IS_ENABLED(CONFIG_HYPERV) && 2585 static_branch_unlikely(&enable_evmcs) && 2586 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { 2587 struct hv_enlightened_vmcs *evmcs = 2588 (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs; 2589 2590 evmcs->hv_enlightenments_control.msr_bitmap = 1; 2591 } 2592 } 2593 2594 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 2595 memset(&loaded_vmcs->controls_shadow, 0, 2596 sizeof(struct vmcs_controls_shadow)); 2597 2598 return 0; 2599 2600 out_vmcs: 2601 free_loaded_vmcs(loaded_vmcs); 2602 return -ENOMEM; 2603 } 2604 2605 static void free_kvm_area(void) 2606 { 2607 int cpu; 2608 2609 for_each_possible_cpu(cpu) { 2610 free_vmcs(per_cpu(vmxarea, cpu)); 2611 per_cpu(vmxarea, cpu) = NULL; 2612 } 2613 } 2614 2615 static __init int alloc_kvm_area(void) 2616 { 2617 int cpu; 2618 2619 for_each_possible_cpu(cpu) { 2620 struct vmcs *vmcs; 2621 2622 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); 2623 if (!vmcs) { 2624 free_kvm_area(); 2625 return -ENOMEM; 2626 } 2627 2628 /* 2629 * When eVMCS is enabled, alloc_vmcs_cpu() sets 2630 * 
vmcs->revision_id to KVM_EVMCS_VERSION instead of 2631 * revision_id reported by MSR_IA32_VMX_BASIC. 2632 * 2633 * However, even though not explicitly documented by 2634 * TLFS, VMXArea passed as VMXON argument should 2635 * still be marked with revision_id reported by 2636 * physical CPU. 2637 */ 2638 if (static_branch_unlikely(&enable_evmcs)) 2639 vmcs->hdr.revision_id = vmcs_config.revision_id; 2640 2641 per_cpu(vmxarea, cpu) = vmcs; 2642 } 2643 return 0; 2644 } 2645 2646 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 2647 struct kvm_segment *save) 2648 { 2649 if (!emulate_invalid_guest_state) { 2650 /* 2651 * CS and SS RPL should be equal during guest entry according 2652 * to VMX spec, but in reality it is not always so. Since vcpu 2653 * is in the middle of the transition from real mode to 2654 * protected mode it is safe to assume that RPL 0 is a good 2655 * default value. 2656 */ 2657 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 2658 save->selector &= ~SEGMENT_RPL_MASK; 2659 save->dpl = save->selector & SEGMENT_RPL_MASK; 2660 save->s = 1; 2661 } 2662 vmx_set_segment(vcpu, save, seg); 2663 } 2664 2665 static void enter_pmode(struct kvm_vcpu *vcpu) 2666 { 2667 unsigned long flags; 2668 struct vcpu_vmx *vmx = to_vmx(vcpu); 2669 2670 /* 2671 * Update real mode segment cache. It may be not up-to-date if sement 2672 * register was written while vcpu was in a guest mode. 2673 */ 2674 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 2675 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 2676 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 2677 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 2678 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 2679 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 2680 2681 vmx->rmode.vm86_active = 0; 2682 2683 vmx_segment_cache_clear(vmx); 2684 2685 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 2686 2687 flags = vmcs_readl(GUEST_RFLAGS); 2688 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 2689 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 2690 vmcs_writel(GUEST_RFLAGS, flags); 2691 2692 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 2693 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 2694 2695 update_exception_bitmap(vcpu); 2696 2697 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 2698 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 2699 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 2700 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 2701 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 2702 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 2703 } 2704 2705 static void fix_rmode_seg(int seg, struct kvm_segment *save) 2706 { 2707 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2708 struct kvm_segment var = *save; 2709 2710 var.dpl = 0x3; 2711 if (seg == VCPU_SREG_CS) 2712 var.type = 0x3; 2713 2714 if (!emulate_invalid_guest_state) { 2715 var.selector = var.base >> 4; 2716 var.base = var.base & 0xffff0; 2717 var.limit = 0xffff; 2718 var.g = 0; 2719 var.db = 0; 2720 var.present = 1; 2721 var.s = 1; 2722 var.l = 0; 2723 var.unusable = 0; 2724 var.type = 0x3; 2725 var.avl = 0; 2726 if (save->base & 0xf) 2727 printk_once(KERN_WARNING "kvm: segment base is not " 2728 "paragraph aligned when entering " 2729 "protected mode (seg=%d)", seg); 2730 } 
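	/*
	 * E.g. when invalid guest state is not emulated, a segment with
	 * base 0x12340 is rewritten above to selector 0x1234, base 0x12340
	 * and limit 0xffff, so that base == (selector << 4) and
	 * rmode_segment_valid() later accepts it.
	 */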
2731 2732 vmcs_write16(sf->selector, var.selector); 2733 vmcs_writel(sf->base, var.base); 2734 vmcs_write32(sf->limit, var.limit); 2735 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 2736 } 2737 2738 static void enter_rmode(struct kvm_vcpu *vcpu) 2739 { 2740 unsigned long flags; 2741 struct vcpu_vmx *vmx = to_vmx(vcpu); 2742 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); 2743 2744 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 2745 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 2746 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 2747 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 2748 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 2749 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 2750 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 2751 2752 vmx->rmode.vm86_active = 1; 2753 2754 /* 2755 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 2756 * vcpu. Warn the user that an update is overdue. 2757 */ 2758 if (!kvm_vmx->tss_addr) 2759 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " 2760 "called before entering vcpu\n"); 2761 2762 vmx_segment_cache_clear(vmx); 2763 2764 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); 2765 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 2766 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 2767 2768 flags = vmcs_readl(GUEST_RFLAGS); 2769 vmx->rmode.save_rflags = flags; 2770 2771 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 2772 2773 vmcs_writel(GUEST_RFLAGS, flags); 2774 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 2775 update_exception_bitmap(vcpu); 2776 2777 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 2778 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 2779 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 2780 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 2781 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 2782 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 2783 2784 kvm_mmu_reset_context(vcpu); 2785 } 2786 2787 void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 2788 { 2789 struct vcpu_vmx *vmx = to_vmx(vcpu); 2790 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 2791 2792 if (!msr) 2793 return; 2794 2795 vcpu->arch.efer = efer; 2796 if (efer & EFER_LMA) { 2797 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 2798 msr->data = efer; 2799 } else { 2800 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 2801 2802 msr->data = efer & ~EFER_LME; 2803 } 2804 setup_msrs(vmx); 2805 } 2806 2807 #ifdef CONFIG_X86_64 2808 2809 static void enter_lmode(struct kvm_vcpu *vcpu) 2810 { 2811 u32 guest_tr_ar; 2812 2813 vmx_segment_cache_clear(to_vmx(vcpu)); 2814 2815 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 2816 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { 2817 pr_debug_ratelimited("%s: tss fixup for long mode. 
\n", 2818 __func__); 2819 vmcs_write32(GUEST_TR_AR_BYTES, 2820 (guest_tr_ar & ~VMX_AR_TYPE_MASK) 2821 | VMX_AR_TYPE_BUSY_64_TSS); 2822 } 2823 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 2824 } 2825 2826 static void exit_lmode(struct kvm_vcpu *vcpu) 2827 { 2828 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 2829 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 2830 } 2831 2832 #endif 2833 2834 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 2835 { 2836 int vpid = to_vmx(vcpu)->vpid; 2837 2838 if (!vpid_sync_vcpu_addr(vpid, addr)) 2839 vpid_sync_context(vpid); 2840 2841 /* 2842 * If VPIDs are not supported or enabled, then the above is a no-op. 2843 * But we don't really need a TLB flush in that case anyway, because 2844 * each VM entry/exit includes an implicit flush when VPID is 0. 2845 */ 2846 } 2847 2848 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 2849 { 2850 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 2851 2852 vcpu->arch.cr0 &= ~cr0_guest_owned_bits; 2853 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; 2854 } 2855 2856 static void vmx_decache_cr3(struct kvm_vcpu *vcpu) 2857 { 2858 if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) 2859 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 2860 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 2861 } 2862 2863 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 2864 { 2865 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2866 2867 vcpu->arch.cr4 &= ~cr4_guest_owned_bits; 2868 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; 2869 } 2870 2871 static void ept_load_pdptrs(struct kvm_vcpu *vcpu) 2872 { 2873 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 2874 2875 if (!test_bit(VCPU_EXREG_PDPTR, 2876 (unsigned long *)&vcpu->arch.regs_dirty)) 2877 return; 2878 2879 if (is_pae_paging(vcpu)) { 2880 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 2881 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 2882 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 2883 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 2884 } 2885 } 2886 2887 void ept_save_pdptrs(struct kvm_vcpu *vcpu) 2888 { 2889 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 2890 2891 if (is_pae_paging(vcpu)) { 2892 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 2893 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 2894 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 2895 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 2896 } 2897 2898 __set_bit(VCPU_EXREG_PDPTR, 2899 (unsigned long *)&vcpu->arch.regs_avail); 2900 __set_bit(VCPU_EXREG_PDPTR, 2901 (unsigned long *)&vcpu->arch.regs_dirty); 2902 } 2903 2904 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, 2905 unsigned long cr0, 2906 struct kvm_vcpu *vcpu) 2907 { 2908 struct vcpu_vmx *vmx = to_vmx(vcpu); 2909 2910 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) 2911 vmx_decache_cr3(vcpu); 2912 if (!(cr0 & X86_CR0_PG)) { 2913 /* From paging/starting to nonpaging */ 2914 exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | 2915 CPU_BASED_CR3_STORE_EXITING); 2916 vcpu->arch.cr0 = cr0; 2917 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 2918 } else if (!is_paging(vcpu)) { 2919 /* From nonpaging to paging */ 2920 exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING | 2921 CPU_BASED_CR3_STORE_EXITING); 2922 vcpu->arch.cr0 = cr0; 2923 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 2924 } 2925 2926 if (!(cr0 & X86_CR0_WP)) 2927 *hw_cr0 &= ~X86_CR0_WP; 2928 } 2929 2930 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 2931 { 2932 
struct vcpu_vmx *vmx = to_vmx(vcpu); 2933 unsigned long hw_cr0; 2934 2935 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); 2936 if (enable_unrestricted_guest) 2937 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 2938 else { 2939 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 2940 2941 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 2942 enter_pmode(vcpu); 2943 2944 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 2945 enter_rmode(vcpu); 2946 } 2947 2948 #ifdef CONFIG_X86_64 2949 if (vcpu->arch.efer & EFER_LME) { 2950 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 2951 enter_lmode(vcpu); 2952 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 2953 exit_lmode(vcpu); 2954 } 2955 #endif 2956 2957 if (enable_ept && !enable_unrestricted_guest) 2958 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 2959 2960 vmcs_writel(CR0_READ_SHADOW, cr0); 2961 vmcs_writel(GUEST_CR0, hw_cr0); 2962 vcpu->arch.cr0 = cr0; 2963 2964 /* depends on vcpu->arch.cr0 to be set to a new value */ 2965 vmx->emulation_required = emulation_required(vcpu); 2966 } 2967 2968 static int get_ept_level(struct kvm_vcpu *vcpu) 2969 { 2970 if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) 2971 return 5; 2972 return 4; 2973 } 2974 2975 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) 2976 { 2977 u64 eptp = VMX_EPTP_MT_WB; 2978 2979 eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; 2980 2981 if (enable_ept_ad_bits && 2982 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) 2983 eptp |= VMX_EPTP_AD_ENABLE_BIT; 2984 eptp |= (root_hpa & PAGE_MASK); 2985 2986 return eptp; 2987 } 2988 2989 void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 2990 { 2991 struct kvm *kvm = vcpu->kvm; 2992 unsigned long guest_cr3; 2993 u64 eptp; 2994 2995 guest_cr3 = cr3; 2996 if (enable_ept) { 2997 eptp = construct_eptp(vcpu, cr3); 2998 vmcs_write64(EPT_POINTER, eptp); 2999 3000 if (kvm_x86_ops->tlb_remote_flush) { 3001 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); 3002 to_vmx(vcpu)->ept_pointer = eptp; 3003 to_kvm_vmx(kvm)->ept_pointers_match 3004 = EPT_POINTERS_CHECK; 3005 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); 3006 } 3007 3008 if (enable_unrestricted_guest || is_paging(vcpu) || 3009 is_guest_mode(vcpu)) 3010 guest_cr3 = kvm_read_cr3(vcpu); 3011 else 3012 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; 3013 ept_load_pdptrs(vcpu); 3014 } 3015 3016 vmcs_writel(GUEST_CR3, guest_cr3); 3017 } 3018 3019 int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3020 { 3021 struct vcpu_vmx *vmx = to_vmx(vcpu); 3022 /* 3023 * Pass through host's Machine Check Enable value to hw_cr4, which 3024 * is in force while we are in guest mode. Do not let guests control 3025 * this bit, even if host CR4.MCE == 0. 
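 * In effect the value written below is
 * hw_cr4 = (host CR4 & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE), with the
 * always-on bits for the current guest mode OR'ed in afterwards.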
3026 */ 3027 unsigned long hw_cr4; 3028 3029 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); 3030 if (enable_unrestricted_guest) 3031 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; 3032 else if (vmx->rmode.vm86_active) 3033 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; 3034 else 3035 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; 3036 3037 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { 3038 if (cr4 & X86_CR4_UMIP) { 3039 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); 3040 hw_cr4 &= ~X86_CR4_UMIP; 3041 } else if (!is_guest_mode(vcpu) || 3042 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) { 3043 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC); 3044 } 3045 } 3046 3047 if (cr4 & X86_CR4_VMXE) { 3048 /* 3049 * To use VMXON (and later other VMX instructions), a guest 3050 * must first be able to turn on cr4.VMXE (see handle_vmon()). 3051 * So basically the check on whether to allow nested VMX 3052 * is here. We operate under the default treatment of SMM, 3053 * so VMX cannot be enabled under SMM. 3054 */ 3055 if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) 3056 return 1; 3057 } 3058 3059 if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) 3060 return 1; 3061 3062 vcpu->arch.cr4 = cr4; 3063 3064 if (!enable_unrestricted_guest) { 3065 if (enable_ept) { 3066 if (!is_paging(vcpu)) { 3067 hw_cr4 &= ~X86_CR4_PAE; 3068 hw_cr4 |= X86_CR4_PSE; 3069 } else if (!(cr4 & X86_CR4_PAE)) { 3070 hw_cr4 &= ~X86_CR4_PAE; 3071 } 3072 } 3073 3074 /* 3075 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in 3076 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs 3077 * to be manually disabled when guest switches to non-paging 3078 * mode. 3079 * 3080 * If !enable_unrestricted_guest, the CPU is always running 3081 * with CR0.PG=1 and CR4 needs to be modified. 3082 * If enable_unrestricted_guest, the CPU automatically 3083 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. 3084 */ 3085 if (!is_paging(vcpu)) 3086 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 3087 } 3088 3089 vmcs_writel(CR4_READ_SHADOW, cr4); 3090 vmcs_writel(GUEST_CR4, hw_cr4); 3091 return 0; 3092 } 3093 3094 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3095 { 3096 struct vcpu_vmx *vmx = to_vmx(vcpu); 3097 u32 ar; 3098 3099 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3100 *var = vmx->rmode.segs[seg]; 3101 if (seg == VCPU_SREG_TR 3102 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3103 return; 3104 var->base = vmx_read_guest_seg_base(vmx, seg); 3105 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3106 return; 3107 } 3108 var->base = vmx_read_guest_seg_base(vmx, seg); 3109 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3110 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3111 ar = vmx_read_guest_seg_ar(vmx, seg); 3112 var->unusable = (ar >> 16) & 1; 3113 var->type = ar & 15; 3114 var->s = (ar >> 4) & 1; 3115 var->dpl = (ar >> 5) & 3; 3116 /* 3117 * Some userspaces do not preserve unusable property. Since usable 3118 * segment has to be present according to VMX spec we can use present 3119 * property to amend userspace bug by making unusable segment always 3120 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3121 * segment as unusable. 
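 *
 * For reference, the access-rights word decoded below follows the
 * VMCS layout: type in bits 3:0, S in bit 4, DPL in bits 6:5, P in
 * bit 7, AVL in bit 12, L in bit 13, D/B in bit 14, G in bit 15 and
 * the "unusable" flag in bit 16.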
3122 */ 3123 var->present = !var->unusable; 3124 var->avl = (ar >> 12) & 1; 3125 var->l = (ar >> 13) & 1; 3126 var->db = (ar >> 14) & 1; 3127 var->g = (ar >> 15) & 1; 3128 } 3129 3130 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3131 { 3132 struct kvm_segment s; 3133 3134 if (to_vmx(vcpu)->rmode.vm86_active) { 3135 vmx_get_segment(vcpu, &s, seg); 3136 return s.base; 3137 } 3138 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3139 } 3140 3141 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3142 { 3143 struct vcpu_vmx *vmx = to_vmx(vcpu); 3144 3145 if (unlikely(vmx->rmode.vm86_active)) 3146 return 0; 3147 else { 3148 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3149 return VMX_AR_DPL(ar); 3150 } 3151 } 3152 3153 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3154 { 3155 u32 ar; 3156 3157 if (var->unusable || !var->present) 3158 ar = 1 << 16; 3159 else { 3160 ar = var->type & 15; 3161 ar |= (var->s & 1) << 4; 3162 ar |= (var->dpl & 3) << 5; 3163 ar |= (var->present & 1) << 7; 3164 ar |= (var->avl & 1) << 12; 3165 ar |= (var->l & 1) << 13; 3166 ar |= (var->db & 1) << 14; 3167 ar |= (var->g & 1) << 15; 3168 } 3169 3170 return ar; 3171 } 3172 3173 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3174 { 3175 struct vcpu_vmx *vmx = to_vmx(vcpu); 3176 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3177 3178 vmx_segment_cache_clear(vmx); 3179 3180 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3181 vmx->rmode.segs[seg] = *var; 3182 if (seg == VCPU_SREG_TR) 3183 vmcs_write16(sf->selector, var->selector); 3184 else if (var->s) 3185 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3186 goto out; 3187 } 3188 3189 vmcs_writel(sf->base, var->base); 3190 vmcs_write32(sf->limit, var->limit); 3191 vmcs_write16(sf->selector, var->selector); 3192 3193 /* 3194 * Fix the "Accessed" bit in AR field of segment registers for older 3195 * qemu binaries. 3196 * IA32 arch specifies that at the time of processor reset the 3197 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3198 * is setting it to 0 in the userland code. This causes invalid guest 3199 * state vmexit when "unrestricted guest" mode is turned on. 3200 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3201 * tree. Newer qemu binaries with that qemu fix would not need this 3202 * kvm hack. 
3203 */ 3204 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) 3205 var->type |= 0x1; /* Accessed */ 3206 3207 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3208 3209 out: 3210 vmx->emulation_required = emulation_required(vcpu); 3211 } 3212 3213 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3214 { 3215 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3216 3217 *db = (ar >> 14) & 1; 3218 *l = (ar >> 13) & 1; 3219 } 3220 3221 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3222 { 3223 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3224 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3225 } 3226 3227 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3228 { 3229 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3230 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3231 } 3232 3233 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3234 { 3235 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3236 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3237 } 3238 3239 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3240 { 3241 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3242 vmcs_writel(GUEST_GDTR_BASE, dt->address); 3243 } 3244 3245 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3246 { 3247 struct kvm_segment var; 3248 u32 ar; 3249 3250 vmx_get_segment(vcpu, &var, seg); 3251 var.dpl = 0x3; 3252 if (seg == VCPU_SREG_CS) 3253 var.type = 0x3; 3254 ar = vmx_segment_access_rights(&var); 3255 3256 if (var.base != (var.selector << 4)) 3257 return false; 3258 if (var.limit != 0xffff) 3259 return false; 3260 if (ar != 0xf3) 3261 return false; 3262 3263 return true; 3264 } 3265 3266 static bool code_segment_valid(struct kvm_vcpu *vcpu) 3267 { 3268 struct kvm_segment cs; 3269 unsigned int cs_rpl; 3270 3271 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3272 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3273 3274 if (cs.unusable) 3275 return false; 3276 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 3277 return false; 3278 if (!cs.s) 3279 return false; 3280 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 3281 if (cs.dpl > cs_rpl) 3282 return false; 3283 } else { 3284 if (cs.dpl != cs_rpl) 3285 return false; 3286 } 3287 if (!cs.present) 3288 return false; 3289 3290 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3291 return true; 3292 } 3293 3294 static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3295 { 3296 struct kvm_segment ss; 3297 unsigned int ss_rpl; 3298 3299 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3300 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3301 3302 if (ss.unusable) 3303 return true; 3304 if (ss.type != 3 && ss.type != 7) 3305 return false; 3306 if (!ss.s) 3307 return false; 3308 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3309 return false; 3310 if (!ss.present) 3311 return false; 3312 3313 return true; 3314 } 3315 3316 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3317 { 3318 struct kvm_segment var; 3319 unsigned int rpl; 3320 3321 vmx_get_segment(vcpu, &var, seg); 3322 rpl = var.selector & SEGMENT_RPL_MASK; 3323 3324 if (var.unusable) 3325 return true; 3326 if (!var.s) 3327 return false; 3328 if (!var.present) 3329 return false; 3330 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 3331 if (var.dpl < rpl) /* DPL < RPL */ 3332 return false; 3333 } 3334 3335 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3336 * rights flags 3337 */ 3338 return true; 3339 } 
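/*
 * For the protected mode checks below, TR is considered valid only if it
 * is usable, points into the GDT (TI clear), is present and has a busy
 * TSS type: 3 for a 16-bit TSS or 11 for a 32/64-bit TSS.
 */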
3340 3341 static bool tr_valid(struct kvm_vcpu *vcpu) 3342 { 3343 struct kvm_segment tr; 3344 3345 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3346 3347 if (tr.unusable) 3348 return false; 3349 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3350 return false; 3351 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3352 return false; 3353 if (!tr.present) 3354 return false; 3355 3356 return true; 3357 } 3358 3359 static bool ldtr_valid(struct kvm_vcpu *vcpu) 3360 { 3361 struct kvm_segment ldtr; 3362 3363 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3364 3365 if (ldtr.unusable) 3366 return true; 3367 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3368 return false; 3369 if (ldtr.type != 2) 3370 return false; 3371 if (!ldtr.present) 3372 return false; 3373 3374 return true; 3375 } 3376 3377 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3378 { 3379 struct kvm_segment cs, ss; 3380 3381 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3382 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3383 3384 return ((cs.selector & SEGMENT_RPL_MASK) == 3385 (ss.selector & SEGMENT_RPL_MASK)); 3386 } 3387 3388 /* 3389 * Check if guest state is valid. Returns true if valid, false if 3390 * not. 3391 * We assume that registers are always usable 3392 */ 3393 static bool guest_state_valid(struct kvm_vcpu *vcpu) 3394 { 3395 if (enable_unrestricted_guest) 3396 return true; 3397 3398 /* real mode guest state checks */ 3399 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 3400 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 3401 return false; 3402 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 3403 return false; 3404 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 3405 return false; 3406 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 3407 return false; 3408 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 3409 return false; 3410 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 3411 return false; 3412 } else { 3413 /* protected mode guest state checks */ 3414 if (!cs_ss_rpl_check(vcpu)) 3415 return false; 3416 if (!code_segment_valid(vcpu)) 3417 return false; 3418 if (!stack_segment_valid(vcpu)) 3419 return false; 3420 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 3421 return false; 3422 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 3423 return false; 3424 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 3425 return false; 3426 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 3427 return false; 3428 if (!tr_valid(vcpu)) 3429 return false; 3430 if (!ldtr_valid(vcpu)) 3431 return false; 3432 } 3433 /* TODO: 3434 * - Add checks on RIP 3435 * - Add checks on RFLAGS 3436 */ 3437 3438 return true; 3439 } 3440 3441 static int init_rmode_tss(struct kvm *kvm) 3442 { 3443 gfn_t fn; 3444 u16 data = 0; 3445 int idx, r; 3446 3447 idx = srcu_read_lock(&kvm->srcu); 3448 fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT; 3449 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 3450 if (r < 0) 3451 goto out; 3452 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 3453 r = kvm_write_guest_page(kvm, fn++, &data, 3454 TSS_IOPB_BASE_OFFSET, sizeof(u16)); 3455 if (r < 0) 3456 goto out; 3457 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); 3458 if (r < 0) 3459 goto out; 3460 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 3461 if (r < 0) 3462 goto out; 3463 data = ~0; 3464 r = kvm_write_guest_page(kvm, fn, &data, 3465 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, 3466 sizeof(u8)); 3467 out: 3468 srcu_read_unlock(&kvm->srcu, idx); 3469 return r; 3470 } 3471 3472 static int init_rmode_identity_map(struct kvm *kvm) 3473 { 3474 struct kvm_vmx 
*kvm_vmx = to_kvm_vmx(kvm); 3475 int i, idx, r = 0; 3476 kvm_pfn_t identity_map_pfn; 3477 u32 tmp; 3478 3479 /* Protect kvm_vmx->ept_identity_pagetable_done. */ 3480 mutex_lock(&kvm->slots_lock); 3481 3482 if (likely(kvm_vmx->ept_identity_pagetable_done)) 3483 goto out2; 3484 3485 if (!kvm_vmx->ept_identity_map_addr) 3486 kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; 3487 identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT; 3488 3489 r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 3490 kvm_vmx->ept_identity_map_addr, PAGE_SIZE); 3491 if (r < 0) 3492 goto out2; 3493 3494 idx = srcu_read_lock(&kvm->srcu); 3495 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); 3496 if (r < 0) 3497 goto out; 3498 /* Set up identity-mapping pagetable for EPT in real mode */ 3499 for (i = 0; i < PT32_ENT_PER_PAGE; i++) { 3500 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 3501 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 3502 r = kvm_write_guest_page(kvm, identity_map_pfn, 3503 &tmp, i * sizeof(tmp), sizeof(tmp)); 3504 if (r < 0) 3505 goto out; 3506 } 3507 kvm_vmx->ept_identity_pagetable_done = true; 3508 3509 out: 3510 srcu_read_unlock(&kvm->srcu, idx); 3511 3512 out2: 3513 mutex_unlock(&kvm->slots_lock); 3514 return r; 3515 } 3516 3517 static void seg_setup(int seg) 3518 { 3519 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3520 unsigned int ar; 3521 3522 vmcs_write16(sf->selector, 0); 3523 vmcs_writel(sf->base, 0); 3524 vmcs_write32(sf->limit, 0xffff); 3525 ar = 0x93; 3526 if (seg == VCPU_SREG_CS) 3527 ar |= 0x08; /* code segment */ 3528 3529 vmcs_write32(sf->ar_bytes, ar); 3530 } 3531 3532 static int alloc_apic_access_page(struct kvm *kvm) 3533 { 3534 struct page *page; 3535 int r = 0; 3536 3537 mutex_lock(&kvm->slots_lock); 3538 if (kvm->arch.apic_access_page_done) 3539 goto out; 3540 r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 3541 APIC_DEFAULT_PHYS_BASE, PAGE_SIZE); 3542 if (r) 3543 goto out; 3544 3545 page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); 3546 if (is_error_page(page)) { 3547 r = -EFAULT; 3548 goto out; 3549 } 3550 3551 /* 3552 * Do not pin the page in memory, so that memory hot-unplug 3553 * is able to migrate it. 3554 */ 3555 put_page(page); 3556 kvm->arch.apic_access_page_done = true; 3557 out: 3558 mutex_unlock(&kvm->slots_lock); 3559 return r; 3560 } 3561 3562 int allocate_vpid(void) 3563 { 3564 int vpid; 3565 3566 if (!enable_vpid) 3567 return 0; 3568 spin_lock(&vmx_vpid_lock); 3569 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 3570 if (vpid < VMX_NR_VPIDS) 3571 __set_bit(vpid, vmx_vpid_bitmap); 3572 else 3573 vpid = 0; 3574 spin_unlock(&vmx_vpid_lock); 3575 return vpid; 3576 } 3577 3578 void free_vpid(int vpid) 3579 { 3580 if (!enable_vpid || vpid == 0) 3581 return; 3582 spin_lock(&vmx_vpid_lock); 3583 __clear_bit(vpid, vmx_vpid_bitmap); 3584 spin_unlock(&vmx_vpid_lock); 3585 } 3586 3587 static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, 3588 u32 msr, int type) 3589 { 3590 int f = sizeof(unsigned long); 3591 3592 if (!cpu_has_vmx_msr_bitmap()) 3593 return; 3594 3595 if (static_branch_unlikely(&enable_evmcs)) 3596 evmcs_touch_msr_bitmap(); 3597 3598 /* 3599 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 3600 * have the write-low and read-high bitmap offsets the wrong way round. 3601 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 
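 * The 4KiB bitmap holds four 1KiB regions, one bit per MSR: read-low
 * at offset 0x000, read-high at 0x400, write-low at 0x800 and
 * write-high at 0xc00. E.g. clearing the read intercept for MSR_LSTAR
 * (0xc0000082) clears bit 0x82 in the region at offset 0x400.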
3602 */ 3603 if (msr <= 0x1fff) { 3604 if (type & MSR_TYPE_R) 3605 /* read-low */ 3606 __clear_bit(msr, msr_bitmap + 0x000 / f); 3607 3608 if (type & MSR_TYPE_W) 3609 /* write-low */ 3610 __clear_bit(msr, msr_bitmap + 0x800 / f); 3611 3612 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 3613 msr &= 0x1fff; 3614 if (type & MSR_TYPE_R) 3615 /* read-high */ 3616 __clear_bit(msr, msr_bitmap + 0x400 / f); 3617 3618 if (type & MSR_TYPE_W) 3619 /* write-high */ 3620 __clear_bit(msr, msr_bitmap + 0xc00 / f); 3621 3622 } 3623 } 3624 3625 static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, 3626 u32 msr, int type) 3627 { 3628 int f = sizeof(unsigned long); 3629 3630 if (!cpu_has_vmx_msr_bitmap()) 3631 return; 3632 3633 if (static_branch_unlikely(&enable_evmcs)) 3634 evmcs_touch_msr_bitmap(); 3635 3636 /* 3637 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 3638 * have the write-low and read-high bitmap offsets the wrong way round. 3639 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 3640 */ 3641 if (msr <= 0x1fff) { 3642 if (type & MSR_TYPE_R) 3643 /* read-low */ 3644 __set_bit(msr, msr_bitmap + 0x000 / f); 3645 3646 if (type & MSR_TYPE_W) 3647 /* write-low */ 3648 __set_bit(msr, msr_bitmap + 0x800 / f); 3649 3650 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 3651 msr &= 0x1fff; 3652 if (type & MSR_TYPE_R) 3653 /* read-high */ 3654 __set_bit(msr, msr_bitmap + 0x400 / f); 3655 3656 if (type & MSR_TYPE_W) 3657 /* write-high */ 3658 __set_bit(msr, msr_bitmap + 0xc00 / f); 3659 3660 } 3661 } 3662 3663 static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap, 3664 u32 msr, int type, bool value) 3665 { 3666 if (value) 3667 vmx_enable_intercept_for_msr(msr_bitmap, msr, type); 3668 else 3669 vmx_disable_intercept_for_msr(msr_bitmap, msr, type); 3670 } 3671 3672 static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) 3673 { 3674 u8 mode = 0; 3675 3676 if (cpu_has_secondary_exec_ctrls() && 3677 (secondary_exec_controls_get(to_vmx(vcpu)) & 3678 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { 3679 mode |= MSR_BITMAP_MODE_X2APIC; 3680 if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) 3681 mode |= MSR_BITMAP_MODE_X2APIC_APICV; 3682 } 3683 3684 return mode; 3685 } 3686 3687 static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap, 3688 u8 mode) 3689 { 3690 int msr; 3691 3692 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 3693 unsigned word = msr / BITS_PER_LONG; 3694 msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0; 3695 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 3696 } 3697 3698 if (mode & MSR_BITMAP_MODE_X2APIC) { 3699 /* 3700 * TPR reads and writes can be virtualized even if virtual interrupt 3701 * delivery is not in use. 
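 * X2APIC_MSR(APIC_TASKPRI) below is the x2APIC TPR MSR (0x808). With
 * APICv active, EOI and SELF_IPI writes are likewise passed through,
 * while TMCCT reads keep exiting so the current-count register stays
 * emulated.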
3702 */ 3703 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW); 3704 if (mode & MSR_BITMAP_MODE_X2APIC_APICV) { 3705 vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R); 3706 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); 3707 vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); 3708 } 3709 } 3710 } 3711 3712 void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) 3713 { 3714 struct vcpu_vmx *vmx = to_vmx(vcpu); 3715 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 3716 u8 mode = vmx_msr_bitmap_mode(vcpu); 3717 u8 changed = mode ^ vmx->msr_bitmap_mode; 3718 3719 if (!changed) 3720 return; 3721 3722 if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) 3723 vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); 3724 3725 vmx->msr_bitmap_mode = mode; 3726 } 3727 3728 void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) 3729 { 3730 unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; 3731 bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); 3732 u32 i; 3733 3734 vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS, 3735 MSR_TYPE_RW, flag); 3736 vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE, 3737 MSR_TYPE_RW, flag); 3738 vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK, 3739 MSR_TYPE_RW, flag); 3740 vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH, 3741 MSR_TYPE_RW, flag); 3742 for (i = 0; i < vmx->pt_desc.addr_range; i++) { 3743 vmx_set_intercept_for_msr(msr_bitmap, 3744 MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag); 3745 vmx_set_intercept_for_msr(msr_bitmap, 3746 MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag); 3747 } 3748 } 3749 3750 static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) 3751 { 3752 return enable_apicv; 3753 } 3754 3755 static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) 3756 { 3757 struct vcpu_vmx *vmx = to_vmx(vcpu); 3758 void *vapic_page; 3759 u32 vppr; 3760 int rvi; 3761 3762 if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || 3763 !nested_cpu_has_vid(get_vmcs12(vcpu)) || 3764 WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn)) 3765 return false; 3766 3767 rvi = vmx_get_rvi(); 3768 3769 vapic_page = vmx->nested.virtual_apic_map.hva; 3770 vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); 3771 3772 return ((rvi & 0xf0) > (vppr & 0xf0)); 3773 } 3774 3775 static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, 3776 bool nested) 3777 { 3778 #ifdef CONFIG_SMP 3779 int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR; 3780 3781 if (vcpu->mode == IN_GUEST_MODE) { 3782 /* 3783 * The vector of interrupt to be delivered to vcpu had 3784 * been set in PIR before this function. 3785 * 3786 * Following cases will be reached in this block, and 3787 * we always send a notification event in all cases as 3788 * explained below. 3789 * 3790 * Case 1: vcpu keeps in non-root mode. Sending a 3791 * notification event posts the interrupt to vcpu. 3792 * 3793 * Case 2: vcpu exits to root mode and is still 3794 * runnable. PIR will be synced to vIRR before the 3795 * next vcpu entry. Sending a notification event in 3796 * this case has no effect, as vcpu is not in root 3797 * mode. 3798 * 3799 * Case 3: vcpu exits to root mode and is blocked. 3800 * vcpu_block() has already synced PIR to vIRR and 3801 * never blocks vcpu if vIRR is not cleared. 
Therefore, 3802 * a blocked vcpu here does not wait for any requested 3803 * interrupts in PIR, and sending a notification event 3804 * which has no effect is safe here. 3805 */ 3806 3807 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec); 3808 return true; 3809 } 3810 #endif 3811 return false; 3812 } 3813 3814 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 3815 int vector) 3816 { 3817 struct vcpu_vmx *vmx = to_vmx(vcpu); 3818 3819 if (is_guest_mode(vcpu) && 3820 vector == vmx->nested.posted_intr_nv) { 3821 /* 3822 * If a posted intr is not recognized by hardware, 3823 * we will accomplish it in the next vmentry. 3824 */ 3825 vmx->nested.pi_pending = true; 3826 kvm_make_request(KVM_REQ_EVENT, vcpu); 3827 /* the PIR and ON have been set by L1. */ 3828 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) 3829 kvm_vcpu_kick(vcpu); 3830 return 0; 3831 } 3832 return -1; 3833 } 3834 /* 3835 * Send interrupt to vcpu via posted interrupt way. 3836 * 1. If target vcpu is running(non-root mode), send posted interrupt 3837 * notification to vcpu and hardware will sync PIR to vIRR atomically. 3838 * 2. If target vcpu isn't running(root mode), kick it to pick up the 3839 * interrupt from PIR in next vmentry. 3840 */ 3841 static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 3842 { 3843 struct vcpu_vmx *vmx = to_vmx(vcpu); 3844 int r; 3845 3846 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 3847 if (!r) 3848 return; 3849 3850 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 3851 return; 3852 3853 /* If a previous notification has sent the IPI, nothing to do. */ 3854 if (pi_test_and_set_on(&vmx->pi_desc)) 3855 return; 3856 3857 if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false)) 3858 kvm_vcpu_kick(vcpu); 3859 } 3860 3861 /* 3862 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 3863 * will not change in the lifetime of the guest. 3864 * Note that host-state that does change is set elsewhere. E.g., host-state 3865 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 3866 */ 3867 void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 3868 { 3869 u32 low32, high32; 3870 unsigned long tmpl; 3871 unsigned long cr0, cr3, cr4; 3872 3873 cr0 = read_cr0(); 3874 WARN_ON(cr0 & X86_CR0_TS); 3875 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 3876 3877 /* 3878 * Save the most likely value for this task's CR3 in the VMCS. 3879 * We can't use __get_current_cr3_fast() because we're not atomic. 3880 */ 3881 cr3 = __read_cr3(); 3882 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ 3883 vmx->loaded_vmcs->host_state.cr3 = cr3; 3884 3885 /* Save the most likely value for this task's CR4 in the VMCS. */ 3886 cr4 = cr4_read_shadow(); 3887 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 3888 vmx->loaded_vmcs->host_state.cr4 = cr4; 3889 3890 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 3891 #ifdef CONFIG_X86_64 3892 /* 3893 * Load null selectors, so we can avoid reloading them in 3894 * vmx_prepare_switch_to_host(), in case userspace uses 3895 * the null selectors too (the expected case). 
3896 */ 3897 vmcs_write16(HOST_DS_SELECTOR, 0); 3898 vmcs_write16(HOST_ES_SELECTOR, 0); 3899 #else 3900 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3901 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3902 #endif 3903 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 3904 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 3905 3906 vmcs_writel(HOST_IDTR_BASE, host_idt_base); /* 22.2.4 */ 3907 3908 vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */ 3909 3910 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 3911 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 3912 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); 3913 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 3914 3915 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 3916 rdmsr(MSR_IA32_CR_PAT, low32, high32); 3917 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 3918 } 3919 3920 if (cpu_has_load_ia32_efer()) 3921 vmcs_write64(HOST_IA32_EFER, host_efer); 3922 } 3923 3924 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 3925 { 3926 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; 3927 if (enable_ept) 3928 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; 3929 if (is_guest_mode(&vmx->vcpu)) 3930 vmx->vcpu.arch.cr4_guest_owned_bits &= 3931 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; 3932 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); 3933 } 3934 3935 u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 3936 { 3937 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 3938 3939 if (!kvm_vcpu_apicv_active(&vmx->vcpu)) 3940 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 3941 3942 if (!enable_vnmi) 3943 pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS; 3944 3945 if (!enable_preemption_timer) 3946 pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 3947 3948 return pin_based_exec_ctrl; 3949 } 3950 3951 static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) 3952 { 3953 struct vcpu_vmx *vmx = to_vmx(vcpu); 3954 3955 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 3956 if (cpu_has_secondary_exec_ctrls()) { 3957 if (kvm_vcpu_apicv_active(vcpu)) 3958 secondary_exec_controls_setbit(vmx, 3959 SECONDARY_EXEC_APIC_REGISTER_VIRT | 3960 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 3961 else 3962 secondary_exec_controls_clearbit(vmx, 3963 SECONDARY_EXEC_APIC_REGISTER_VIRT | 3964 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 3965 } 3966 3967 if (cpu_has_vmx_msr_bitmap()) 3968 vmx_update_msr_bitmap(vcpu); 3969 } 3970 3971 u32 vmx_exec_control(struct vcpu_vmx *vmx) 3972 { 3973 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 3974 3975 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 3976 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 3977 3978 if (!cpu_need_tpr_shadow(&vmx->vcpu)) { 3979 exec_control &= ~CPU_BASED_TPR_SHADOW; 3980 #ifdef CONFIG_X86_64 3981 exec_control |= CPU_BASED_CR8_STORE_EXITING | 3982 CPU_BASED_CR8_LOAD_EXITING; 3983 #endif 3984 } 3985 if (!enable_ept) 3986 exec_control |= CPU_BASED_CR3_STORE_EXITING | 3987 CPU_BASED_CR3_LOAD_EXITING | 3988 CPU_BASED_INVLPG_EXITING; 3989 if (kvm_mwait_in_guest(vmx->vcpu.kvm)) 3990 exec_control &= ~(CPU_BASED_MWAIT_EXITING | 3991 CPU_BASED_MONITOR_EXITING); 3992 if (kvm_hlt_in_guest(vmx->vcpu.kvm)) 3993 exec_control &= ~CPU_BASED_HLT_EXITING; 3994 return exec_control; 3995 } 3996 3997 3998 static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) 3999 { 4000 struct kvm_vcpu *vcpu = &vmx->vcpu; 4001 4002 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4003 4004 if 
(pt_mode == PT_MODE_SYSTEM) 4005 exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); 4006 if (!cpu_need_virtualize_apic_accesses(vcpu)) 4007 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4008 if (vmx->vpid == 0) 4009 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4010 if (!enable_ept) { 4011 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4012 enable_unrestricted_guest = 0; 4013 } 4014 if (!enable_unrestricted_guest) 4015 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4016 if (kvm_pause_in_guest(vmx->vcpu.kvm)) 4017 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4018 if (!kvm_vcpu_apicv_active(vcpu)) 4019 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4020 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4021 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4022 4023 /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, 4024 * in vmx_set_cr4. */ 4025 exec_control &= ~SECONDARY_EXEC_DESC; 4026 4027 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4028 (handle_vmptrld). 4029 We can NOT enable shadow_vmcs here because we don't have yet 4030 a current VMCS12 4031 */ 4032 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 4033 4034 if (!enable_pml) 4035 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 4036 4037 if (vmx_xsaves_supported()) { 4038 /* Exposing XSAVES only when XSAVE is exposed */ 4039 bool xsaves_enabled = 4040 guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && 4041 guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); 4042 4043 if (!xsaves_enabled) 4044 exec_control &= ~SECONDARY_EXEC_XSAVES; 4045 4046 if (nested) { 4047 if (xsaves_enabled) 4048 vmx->nested.msrs.secondary_ctls_high |= 4049 SECONDARY_EXEC_XSAVES; 4050 else 4051 vmx->nested.msrs.secondary_ctls_high &= 4052 ~SECONDARY_EXEC_XSAVES; 4053 } 4054 } 4055 4056 if (vmx_rdtscp_supported()) { 4057 bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); 4058 if (!rdtscp_enabled) 4059 exec_control &= ~SECONDARY_EXEC_RDTSCP; 4060 4061 if (nested) { 4062 if (rdtscp_enabled) 4063 vmx->nested.msrs.secondary_ctls_high |= 4064 SECONDARY_EXEC_RDTSCP; 4065 else 4066 vmx->nested.msrs.secondary_ctls_high &= 4067 ~SECONDARY_EXEC_RDTSCP; 4068 } 4069 } 4070 4071 if (vmx_invpcid_supported()) { 4072 /* Exposing INVPCID only when PCID is exposed */ 4073 bool invpcid_enabled = 4074 guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && 4075 guest_cpuid_has(vcpu, X86_FEATURE_PCID); 4076 4077 if (!invpcid_enabled) { 4078 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; 4079 guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID); 4080 } 4081 4082 if (nested) { 4083 if (invpcid_enabled) 4084 vmx->nested.msrs.secondary_ctls_high |= 4085 SECONDARY_EXEC_ENABLE_INVPCID; 4086 else 4087 vmx->nested.msrs.secondary_ctls_high &= 4088 ~SECONDARY_EXEC_ENABLE_INVPCID; 4089 } 4090 } 4091 4092 if (vmx_rdrand_supported()) { 4093 bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND); 4094 if (rdrand_enabled) 4095 exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING; 4096 4097 if (nested) { 4098 if (rdrand_enabled) 4099 vmx->nested.msrs.secondary_ctls_high |= 4100 SECONDARY_EXEC_RDRAND_EXITING; 4101 else 4102 vmx->nested.msrs.secondary_ctls_high &= 4103 ~SECONDARY_EXEC_RDRAND_EXITING; 4104 } 4105 } 4106 4107 if (vmx_rdseed_supported()) { 4108 bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED); 4109 if (rdseed_enabled) 4110 exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING; 4111 4112 if (nested) { 4113 if (rdseed_enabled) 4114 vmx->nested.msrs.secondary_ctls_high |= 4115 SECONDARY_EXEC_RDSEED_EXITING; 4116 else 4117 
vmx->nested.msrs.secondary_ctls_high &= 4118 ~SECONDARY_EXEC_RDSEED_EXITING; 4119 } 4120 } 4121 4122 if (vmx_waitpkg_supported()) { 4123 bool waitpkg_enabled = 4124 guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG); 4125 4126 if (!waitpkg_enabled) 4127 exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; 4128 4129 if (nested) { 4130 if (waitpkg_enabled) 4131 vmx->nested.msrs.secondary_ctls_high |= 4132 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; 4133 else 4134 vmx->nested.msrs.secondary_ctls_high &= 4135 ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; 4136 } 4137 } 4138 4139 vmx->secondary_exec_control = exec_control; 4140 } 4141 4142 static void ept_set_mmio_spte_mask(void) 4143 { 4144 /* 4145 * EPT Misconfigurations can be generated if the value of bits 2:0 4146 * of an EPT paging-structure entry is 110b (write/execute). 4147 */ 4148 kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK, 4149 VMX_EPT_MISCONFIG_WX_VALUE, 0); 4150 } 4151 4152 #define VMX_XSS_EXIT_BITMAP 0 4153 4154 /* 4155 * Sets up the vmcs for emulated real mode. 4156 */ 4157 static void vmx_vcpu_setup(struct vcpu_vmx *vmx) 4158 { 4159 int i; 4160 4161 if (nested) 4162 nested_vmx_vcpu_setup(); 4163 4164 if (cpu_has_vmx_msr_bitmap()) 4165 vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); 4166 4167 vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ 4168 4169 /* Control */ 4170 pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); 4171 vmx->hv_deadline_tsc = -1; 4172 4173 exec_controls_set(vmx, vmx_exec_control(vmx)); 4174 4175 if (cpu_has_secondary_exec_ctrls()) { 4176 vmx_compute_secondary_exec_control(vmx); 4177 secondary_exec_controls_set(vmx, vmx->secondary_exec_control); 4178 } 4179 4180 if (kvm_vcpu_apicv_active(&vmx->vcpu)) { 4181 vmcs_write64(EOI_EXIT_BITMAP0, 0); 4182 vmcs_write64(EOI_EXIT_BITMAP1, 0); 4183 vmcs_write64(EOI_EXIT_BITMAP2, 0); 4184 vmcs_write64(EOI_EXIT_BITMAP3, 0); 4185 4186 vmcs_write16(GUEST_INTR_STATUS, 0); 4187 4188 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 4189 vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc))); 4190 } 4191 4192 if (!kvm_pause_in_guest(vmx->vcpu.kvm)) { 4193 vmcs_write32(PLE_GAP, ple_gap); 4194 vmx->ple_window = ple_window; 4195 vmx->ple_window_dirty = true; 4196 } 4197 4198 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 4199 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 4200 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 4201 4202 vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ 4203 vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ 4204 vmx_set_constant_host_state(vmx); 4205 vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ 4206 vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ 4207 4208 if (cpu_has_vmx_vmfunc()) 4209 vmcs_write64(VM_FUNCTION_CONTROL, 0); 4210 4211 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4212 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 4213 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 4214 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 4215 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 4216 4217 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 4218 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 4219 4220 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { 4221 u32 index = vmx_msr_index[i]; 4222 u32 data_low, data_high; 4223 int j = vmx->nmsrs; 4224 4225 if (rdmsr_safe(index, &data_low, &data_high) < 0) 4226 continue; 4227 if (wrmsr_safe(index, data_low, data_high) < 0) 4228 continue; 4229 vmx->guest_msrs[j].index = i; 4230 vmx->guest_msrs[j].data = 0; 4231 vmx->guest_msrs[j].mask = -1ull; 4232 ++vmx->nmsrs; 4233 } 4234 4235 
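	/*
	 * A note on the loop above: each MSR listed in vmx_msr_index[] is
	 * probed with rdmsr_safe()/wrmsr_safe() so that MSRs the host CPU
	 * does not implement are silently skipped, and guest_msrs[].index
	 * records the position within vmx_msr_index[] rather than the raw
	 * MSR number.  Which of the surviving entries are actually switched
	 * for the guest is decided later (see setup_msrs()).
	 */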
vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); 4236 4237 /* 22.2.1, 20.8.1 */ 4238 vm_entry_controls_set(vmx, vmx_vmentry_ctrl()); 4239 4240 vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS; 4241 vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS); 4242 4243 set_cr4_guest_host_mask(vmx); 4244 4245 if (vmx_xsaves_supported()) 4246 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4247 4248 if (enable_pml) { 4249 vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); 4250 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 4251 } 4252 4253 if (cpu_has_vmx_encls_vmexit()) 4254 vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); 4255 4256 if (pt_mode == PT_MODE_HOST_GUEST) { 4257 memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); 4258 /* Bit[6~0] are forced to 1, writes are ignored. */ 4259 vmx->pt_desc.guest.output_mask = 0x7F; 4260 vmcs_write64(GUEST_IA32_RTIT_CTL, 0); 4261 } 4262 } 4263 4264 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) 4265 { 4266 struct vcpu_vmx *vmx = to_vmx(vcpu); 4267 struct msr_data apic_base_msr; 4268 u64 cr0; 4269 4270 vmx->rmode.vm86_active = 0; 4271 vmx->spec_ctrl = 0; 4272 4273 vmx->msr_ia32_umwait_control = 0; 4274 4275 vcpu->arch.microcode_version = 0x100000000ULL; 4276 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 4277 vmx->hv_deadline_tsc = -1; 4278 kvm_set_cr8(vcpu, 0); 4279 4280 if (!init_event) { 4281 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | 4282 MSR_IA32_APICBASE_ENABLE; 4283 if (kvm_vcpu_is_reset_bsp(vcpu)) 4284 apic_base_msr.data |= MSR_IA32_APICBASE_BSP; 4285 apic_base_msr.host_initiated = true; 4286 kvm_set_apic_base(vcpu, &apic_base_msr); 4287 } 4288 4289 vmx_segment_cache_clear(vmx); 4290 4291 seg_setup(VCPU_SREG_CS); 4292 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4293 vmcs_writel(GUEST_CS_BASE, 0xffff0000ul); 4294 4295 seg_setup(VCPU_SREG_DS); 4296 seg_setup(VCPU_SREG_ES); 4297 seg_setup(VCPU_SREG_FS); 4298 seg_setup(VCPU_SREG_GS); 4299 seg_setup(VCPU_SREG_SS); 4300 4301 vmcs_write16(GUEST_TR_SELECTOR, 0); 4302 vmcs_writel(GUEST_TR_BASE, 0); 4303 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 4304 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4305 4306 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 4307 vmcs_writel(GUEST_LDTR_BASE, 0); 4308 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4309 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4310 4311 if (!init_event) { 4312 vmcs_write32(GUEST_SYSENTER_CS, 0); 4313 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4314 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4315 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4316 } 4317 4318 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); 4319 kvm_rip_write(vcpu, 0xfff0); 4320 4321 vmcs_writel(GUEST_GDTR_BASE, 0); 4322 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4323 4324 vmcs_writel(GUEST_IDTR_BASE, 0); 4325 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4326 4327 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4328 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4329 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4330 if (kvm_mpx_supported()) 4331 vmcs_write64(GUEST_BNDCFGS, 0); 4332 4333 setup_msrs(vmx); 4334 4335 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4336 4337 if (cpu_has_vmx_tpr_shadow() && !init_event) { 4338 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4339 if (cpu_need_tpr_shadow(vcpu)) 4340 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4341 __pa(vcpu->arch.apic->regs)); 4342 vmcs_write32(TPR_THRESHOLD, 0); 4343 } 4344 4345 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4346 4347 if (vmx->vpid != 0) 4348 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4349 4350 cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 
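	/*
	 * NW | CD | ET == 0x60000010, the architectural power-on value of
	 * CR0; vmx_set_cr0() below applies KVM's own always-on/always-off
	 * adjustments before the value reaches hardware.
	 */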
4351 vmx->vcpu.arch.cr0 = cr0; 4352 vmx_set_cr0(vcpu, cr0); /* enter rmode */ 4353 vmx_set_cr4(vcpu, 0); 4354 vmx_set_efer(vcpu, 0); 4355 4356 update_exception_bitmap(vcpu); 4357 4358 vpid_sync_context(vmx->vpid); 4359 if (init_event) 4360 vmx_clear_hlt(vcpu); 4361 } 4362 4363 static void enable_irq_window(struct kvm_vcpu *vcpu) 4364 { 4365 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); 4366 } 4367 4368 static void enable_nmi_window(struct kvm_vcpu *vcpu) 4369 { 4370 if (!enable_vnmi || 4371 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 4372 enable_irq_window(vcpu); 4373 return; 4374 } 4375 4376 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); 4377 } 4378 4379 static void vmx_inject_irq(struct kvm_vcpu *vcpu) 4380 { 4381 struct vcpu_vmx *vmx = to_vmx(vcpu); 4382 uint32_t intr; 4383 int irq = vcpu->arch.interrupt.nr; 4384 4385 trace_kvm_inj_virq(irq); 4386 4387 ++vcpu->stat.irq_injections; 4388 if (vmx->rmode.vm86_active) { 4389 int inc_eip = 0; 4390 if (vcpu->arch.interrupt.soft) 4391 inc_eip = vcpu->arch.event_exit_inst_len; 4392 kvm_inject_realmode_interrupt(vcpu, irq, inc_eip); 4393 return; 4394 } 4395 intr = irq | INTR_INFO_VALID_MASK; 4396 if (vcpu->arch.interrupt.soft) { 4397 intr |= INTR_TYPE_SOFT_INTR; 4398 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 4399 vmx->vcpu.arch.event_exit_inst_len); 4400 } else 4401 intr |= INTR_TYPE_EXT_INTR; 4402 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 4403 4404 vmx_clear_hlt(vcpu); 4405 } 4406 4407 static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 4408 { 4409 struct vcpu_vmx *vmx = to_vmx(vcpu); 4410 4411 if (!enable_vnmi) { 4412 /* 4413 * Tracking the NMI-blocked state in software is built upon 4414 * finding the next open IRQ window. This, in turn, depends on 4415 * well-behaving guests: They have to keep IRQs disabled at 4416 * least as long as the NMI handler runs. Otherwise we may 4417 * cause NMI nesting, maybe breaking the guest. But as this is 4418 * highly unlikely, we can live with the residual risk. 
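		 *
		 * The bookkeeping below implements that software tracking:
		 * soft_vnmi_blocked marks NMIs as blocked until the next IRQ
		 * window, and vnmi_blocked_time (reset here) is meant to
		 * accumulate how long the emulated blocking has been in
		 * effect so it can be abandoned if it stays set for an
		 * unreasonably long time.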
4419 */ 4420 vmx->loaded_vmcs->soft_vnmi_blocked = 1; 4421 vmx->loaded_vmcs->vnmi_blocked_time = 0; 4422 } 4423 4424 ++vcpu->stat.nmi_injections; 4425 vmx->loaded_vmcs->nmi_known_unmasked = false; 4426 4427 if (vmx->rmode.vm86_active) { 4428 kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0); 4429 return; 4430 } 4431 4432 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 4433 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4434 4435 vmx_clear_hlt(vcpu); 4436 } 4437 4438 bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 4439 { 4440 struct vcpu_vmx *vmx = to_vmx(vcpu); 4441 bool masked; 4442 4443 if (!enable_vnmi) 4444 return vmx->loaded_vmcs->soft_vnmi_blocked; 4445 if (vmx->loaded_vmcs->nmi_known_unmasked) 4446 return false; 4447 masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 4448 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 4449 return masked; 4450 } 4451 4452 void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 4453 { 4454 struct vcpu_vmx *vmx = to_vmx(vcpu); 4455 4456 if (!enable_vnmi) { 4457 if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) { 4458 vmx->loaded_vmcs->soft_vnmi_blocked = masked; 4459 vmx->loaded_vmcs->vnmi_blocked_time = 0; 4460 } 4461 } else { 4462 vmx->loaded_vmcs->nmi_known_unmasked = !masked; 4463 if (masked) 4464 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 4465 GUEST_INTR_STATE_NMI); 4466 else 4467 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 4468 GUEST_INTR_STATE_NMI); 4469 } 4470 } 4471 4472 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 4473 { 4474 if (to_vmx(vcpu)->nested.nested_run_pending) 4475 return 0; 4476 4477 if (!enable_vnmi && 4478 to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) 4479 return 0; 4480 4481 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4482 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI 4483 | GUEST_INTR_STATE_NMI)); 4484 } 4485 4486 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 4487 { 4488 return (!to_vmx(vcpu)->nested.nested_run_pending && 4489 vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 4490 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4491 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 4492 } 4493 4494 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 4495 { 4496 int ret; 4497 4498 if (enable_unrestricted_guest) 4499 return 0; 4500 4501 ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr, 4502 PAGE_SIZE * 3); 4503 if (ret) 4504 return ret; 4505 to_kvm_vmx(kvm)->tss_addr = addr; 4506 return init_rmode_tss(kvm); 4507 } 4508 4509 static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) 4510 { 4511 to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr; 4512 return 0; 4513 } 4514 4515 static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 4516 { 4517 switch (vec) { 4518 case BP_VECTOR: 4519 /* 4520 * Update instruction length as we may reinject the exception 4521 * from user space while in guest debugging mode. 
4522 */ 4523 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 4524 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4525 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 4526 return false; 4527 /* fall through */ 4528 case DB_VECTOR: 4529 if (vcpu->guest_debug & 4530 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 4531 return false; 4532 /* fall through */ 4533 case DE_VECTOR: 4534 case OF_VECTOR: 4535 case BR_VECTOR: 4536 case UD_VECTOR: 4537 case DF_VECTOR: 4538 case SS_VECTOR: 4539 case GP_VECTOR: 4540 case MF_VECTOR: 4541 return true; 4542 break; 4543 } 4544 return false; 4545 } 4546 4547 static int handle_rmode_exception(struct kvm_vcpu *vcpu, 4548 int vec, u32 err_code) 4549 { 4550 /* 4551 * Instruction with address size override prefix opcode 0x67 4552 * Cause the #SS fault with 0 error code in VM86 mode. 4553 */ 4554 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 4555 if (kvm_emulate_instruction(vcpu, 0)) { 4556 if (vcpu->arch.halt_request) { 4557 vcpu->arch.halt_request = 0; 4558 return kvm_vcpu_halt(vcpu); 4559 } 4560 return 1; 4561 } 4562 return 0; 4563 } 4564 4565 /* 4566 * Forward all other exceptions that are valid in real mode. 4567 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 4568 * the required debugging infrastructure rework. 4569 */ 4570 kvm_queue_exception(vcpu, vec); 4571 return 1; 4572 } 4573 4574 /* 4575 * Trigger machine check on the host. We assume all the MSRs are already set up 4576 * by the CPU and that we still run on the same CPU as the MCE occurred on. 4577 * We pass a fake environment to the machine check handler because we want 4578 * the guest to be always treated like user space, no matter what context 4579 * it used internally. 4580 */ 4581 static void kvm_machine_check(void) 4582 { 4583 #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) 4584 struct pt_regs regs = { 4585 .cs = 3, /* Fake ring 3 no matter what the guest ran on */ 4586 .flags = X86_EFLAGS_IF, 4587 }; 4588 4589 do_machine_check(®s, 0); 4590 #endif 4591 } 4592 4593 static int handle_machine_check(struct kvm_vcpu *vcpu) 4594 { 4595 /* handled by vmx_vcpu_run() */ 4596 return 1; 4597 } 4598 4599 static int handle_exception_nmi(struct kvm_vcpu *vcpu) 4600 { 4601 struct vcpu_vmx *vmx = to_vmx(vcpu); 4602 struct kvm_run *kvm_run = vcpu->run; 4603 u32 intr_info, ex_no, error_code; 4604 unsigned long cr2, rip, dr6; 4605 u32 vect_info; 4606 4607 vect_info = vmx->idt_vectoring_info; 4608 intr_info = vmx->exit_intr_info; 4609 4610 if (is_machine_check(intr_info) || is_nmi(intr_info)) 4611 return 1; /* handled by handle_exception_nmi_irqoff() */ 4612 4613 if (is_invalid_opcode(intr_info)) 4614 return handle_ud(vcpu); 4615 4616 error_code = 0; 4617 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 4618 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 4619 4620 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { 4621 WARN_ON_ONCE(!enable_vmware_backdoor); 4622 4623 /* 4624 * VMware backdoor emulation on #GP interception only handles 4625 * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero 4626 * error code on #GP. 4627 */ 4628 if (error_code) { 4629 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 4630 return 1; 4631 } 4632 return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP); 4633 } 4634 4635 /* 4636 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 4637 * MMIO, it is better to report an internal error. 4638 * See the comments in vmx_handle_exit. 
4639 */ 4640 if ((vect_info & VECTORING_INFO_VALID_MASK) && 4641 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 4642 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4643 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 4644 vcpu->run->internal.ndata = 3; 4645 vcpu->run->internal.data[0] = vect_info; 4646 vcpu->run->internal.data[1] = intr_info; 4647 vcpu->run->internal.data[2] = error_code; 4648 return 0; 4649 } 4650 4651 if (is_page_fault(intr_info)) { 4652 cr2 = vmcs_readl(EXIT_QUALIFICATION); 4653 /* EPT won't cause page fault directly */ 4654 WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); 4655 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 4656 } 4657 4658 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 4659 4660 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 4661 return handle_rmode_exception(vcpu, ex_no, error_code); 4662 4663 switch (ex_no) { 4664 case AC_VECTOR: 4665 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 4666 return 1; 4667 case DB_VECTOR: 4668 dr6 = vmcs_readl(EXIT_QUALIFICATION); 4669 if (!(vcpu->guest_debug & 4670 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 4671 vcpu->arch.dr6 &= ~DR_TRAP_BITS; 4672 vcpu->arch.dr6 |= dr6 | DR6_RTM; 4673 if (is_icebp(intr_info)) 4674 WARN_ON(!skip_emulated_instruction(vcpu)); 4675 4676 kvm_queue_exception(vcpu, DB_VECTOR); 4677 return 1; 4678 } 4679 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; 4680 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 4681 /* fall through */ 4682 case BP_VECTOR: 4683 /* 4684 * Update instruction length as we may reinject #BP from 4685 * user space while in guest debugging mode. Reading it for 4686 * #DB as well causes no harm, it is not used in that case. 4687 */ 4688 vmx->vcpu.arch.event_exit_inst_len = 4689 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4690 kvm_run->exit_reason = KVM_EXIT_DEBUG; 4691 rip = kvm_rip_read(vcpu); 4692 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 4693 kvm_run->debug.arch.exception = ex_no; 4694 break; 4695 default: 4696 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 4697 kvm_run->ex.exception = ex_no; 4698 kvm_run->ex.error_code = error_code; 4699 break; 4700 } 4701 return 0; 4702 } 4703 4704 static int handle_external_interrupt(struct kvm_vcpu *vcpu) 4705 { 4706 ++vcpu->stat.irq_exits; 4707 return 1; 4708 } 4709 4710 static int handle_triple_fault(struct kvm_vcpu *vcpu) 4711 { 4712 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 4713 vcpu->mmio_needed = 0; 4714 return 0; 4715 } 4716 4717 static int handle_io(struct kvm_vcpu *vcpu) 4718 { 4719 unsigned long exit_qualification; 4720 int size, in, string; 4721 unsigned port; 4722 4723 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4724 string = (exit_qualification & 16) != 0; 4725 4726 ++vcpu->stat.io_exits; 4727 4728 if (string) 4729 return kvm_emulate_instruction(vcpu, 0); 4730 4731 port = exit_qualification >> 16; 4732 size = (exit_qualification & 7) + 1; 4733 in = (exit_qualification & 8) != 0; 4734 4735 return kvm_fast_pio(vcpu, size, port, in); 4736 } 4737 4738 static void 4739 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 4740 { 4741 /* 4742 * Patch in the VMCALL instruction: 4743 */ 4744 hypercall[0] = 0x0f; 4745 hypercall[1] = 0x01; 4746 hypercall[2] = 0xc1; 4747 } 4748 4749 /* called to set cr0 as appropriate for a mov-to-cr0 exit. 
*/ 4750 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 4751 { 4752 if (is_guest_mode(vcpu)) { 4753 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4754 unsigned long orig_val = val; 4755 4756 /* 4757 * We get here when L2 changed cr0 in a way that did not change 4758 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 4759 * but did change L0 shadowed bits. So we first calculate the 4760 * effective cr0 value that L1 would like to write into the 4761 * hardware. It consists of the L2-owned bits from the new 4762 * value combined with the L1-owned bits from L1's guest_cr0. 4763 */ 4764 val = (val & ~vmcs12->cr0_guest_host_mask) | 4765 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 4766 4767 if (!nested_guest_cr0_valid(vcpu, val)) 4768 return 1; 4769 4770 if (kvm_set_cr0(vcpu, val)) 4771 return 1; 4772 vmcs_writel(CR0_READ_SHADOW, orig_val); 4773 return 0; 4774 } else { 4775 if (to_vmx(vcpu)->nested.vmxon && 4776 !nested_host_cr0_valid(vcpu, val)) 4777 return 1; 4778 4779 return kvm_set_cr0(vcpu, val); 4780 } 4781 } 4782 4783 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 4784 { 4785 if (is_guest_mode(vcpu)) { 4786 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4787 unsigned long orig_val = val; 4788 4789 /* analogously to handle_set_cr0 */ 4790 val = (val & ~vmcs12->cr4_guest_host_mask) | 4791 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 4792 if (kvm_set_cr4(vcpu, val)) 4793 return 1; 4794 vmcs_writel(CR4_READ_SHADOW, orig_val); 4795 return 0; 4796 } else 4797 return kvm_set_cr4(vcpu, val); 4798 } 4799 4800 static int handle_desc(struct kvm_vcpu *vcpu) 4801 { 4802 WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); 4803 return kvm_emulate_instruction(vcpu, 0); 4804 } 4805 4806 static int handle_cr(struct kvm_vcpu *vcpu) 4807 { 4808 unsigned long exit_qualification, val; 4809 int cr; 4810 int reg; 4811 int err; 4812 int ret; 4813 4814 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4815 cr = exit_qualification & 15; 4816 reg = (exit_qualification >> 8) & 15; 4817 switch ((exit_qualification >> 4) & 3) { 4818 case 0: /* mov to cr */ 4819 val = kvm_register_readl(vcpu, reg); 4820 trace_kvm_cr_write(cr, val); 4821 switch (cr) { 4822 case 0: 4823 err = handle_set_cr0(vcpu, val); 4824 return kvm_complete_insn_gp(vcpu, err); 4825 case 3: 4826 WARN_ON_ONCE(enable_unrestricted_guest); 4827 err = kvm_set_cr3(vcpu, val); 4828 return kvm_complete_insn_gp(vcpu, err); 4829 case 4: 4830 err = handle_set_cr4(vcpu, val); 4831 return kvm_complete_insn_gp(vcpu, err); 4832 case 8: { 4833 u8 cr8_prev = kvm_get_cr8(vcpu); 4834 u8 cr8 = (u8)val; 4835 err = kvm_set_cr8(vcpu, cr8); 4836 ret = kvm_complete_insn_gp(vcpu, err); 4837 if (lapic_in_kernel(vcpu)) 4838 return ret; 4839 if (cr8_prev <= cr8) 4840 return ret; 4841 /* 4842 * TODO: we might be squashing a 4843 * KVM_GUESTDBG_SINGLESTEP-triggered 4844 * KVM_EXIT_DEBUG here. 
4845 */ 4846 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 4847 return 0; 4848 } 4849 } 4850 break; 4851 case 2: /* clts */ 4852 WARN_ONCE(1, "Guest should always own CR0.TS"); 4853 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 4854 trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); 4855 return kvm_skip_emulated_instruction(vcpu); 4856 case 1: /*mov from cr*/ 4857 switch (cr) { 4858 case 3: 4859 WARN_ON_ONCE(enable_unrestricted_guest); 4860 val = kvm_read_cr3(vcpu); 4861 kvm_register_write(vcpu, reg, val); 4862 trace_kvm_cr_read(cr, val); 4863 return kvm_skip_emulated_instruction(vcpu); 4864 case 8: 4865 val = kvm_get_cr8(vcpu); 4866 kvm_register_write(vcpu, reg, val); 4867 trace_kvm_cr_read(cr, val); 4868 return kvm_skip_emulated_instruction(vcpu); 4869 } 4870 break; 4871 case 3: /* lmsw */ 4872 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 4873 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); 4874 kvm_lmsw(vcpu, val); 4875 4876 return kvm_skip_emulated_instruction(vcpu); 4877 default: 4878 break; 4879 } 4880 vcpu->run->exit_reason = 0; 4881 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 4882 (int)(exit_qualification >> 4) & 3, cr); 4883 return 0; 4884 } 4885 4886 static int handle_dr(struct kvm_vcpu *vcpu) 4887 { 4888 unsigned long exit_qualification; 4889 int dr, dr7, reg; 4890 4891 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4892 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 4893 4894 /* First, if DR does not exist, trigger UD */ 4895 if (!kvm_require_dr(vcpu, dr)) 4896 return 1; 4897 4898 /* Do not handle if the CPL > 0, will trigger GP on re-entry */ 4899 if (!kvm_require_cpl(vcpu, 0)) 4900 return 1; 4901 dr7 = vmcs_readl(GUEST_DR7); 4902 if (dr7 & DR7_GD) { 4903 /* 4904 * As the vm-exit takes precedence over the debug trap, we 4905 * need to emulate the latter, either for the host or the 4906 * guest debugging itself. 4907 */ 4908 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 4909 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; 4910 vcpu->run->debug.arch.dr7 = dr7; 4911 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 4912 vcpu->run->debug.arch.exception = DB_VECTOR; 4913 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 4914 return 0; 4915 } else { 4916 vcpu->arch.dr6 &= ~DR_TRAP_BITS; 4917 vcpu->arch.dr6 |= DR6_BD | DR6_RTM; 4918 kvm_queue_exception(vcpu, DB_VECTOR); 4919 return 1; 4920 } 4921 } 4922 4923 if (vcpu->guest_debug == 0) { 4924 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 4925 4926 /* 4927 * No more DR vmexits; force a reload of the debug registers 4928 * and reenter on this instruction. The next vmexit will 4929 * retrieve the full state of the debug registers. 
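		 *
		 * That retrieval happens in vmx_sync_dirty_debug_regs() below,
		 * which reads DR0-DR3 and DR6 back from hardware (and DR7 from
		 * the VMCS) into vcpu->arch and then turns MOV-DR exiting back
		 * on.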
4930 */ 4931 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 4932 return 1; 4933 } 4934 4935 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 4936 if (exit_qualification & TYPE_MOV_FROM_DR) { 4937 unsigned long val; 4938 4939 if (kvm_get_dr(vcpu, dr, &val)) 4940 return 1; 4941 kvm_register_write(vcpu, reg, val); 4942 } else 4943 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) 4944 return 1; 4945 4946 return kvm_skip_emulated_instruction(vcpu); 4947 } 4948 4949 static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) 4950 { 4951 return vcpu->arch.dr6; 4952 } 4953 4954 static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) 4955 { 4956 } 4957 4958 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 4959 { 4960 get_debugreg(vcpu->arch.db[0], 0); 4961 get_debugreg(vcpu->arch.db[1], 1); 4962 get_debugreg(vcpu->arch.db[2], 2); 4963 get_debugreg(vcpu->arch.db[3], 3); 4964 get_debugreg(vcpu->arch.dr6, 6); 4965 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 4966 4967 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 4968 exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); 4969 } 4970 4971 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 4972 { 4973 vmcs_writel(GUEST_DR7, val); 4974 } 4975 4976 static int handle_cpuid(struct kvm_vcpu *vcpu) 4977 { 4978 return kvm_emulate_cpuid(vcpu); 4979 } 4980 4981 static int handle_rdmsr(struct kvm_vcpu *vcpu) 4982 { 4983 return kvm_emulate_rdmsr(vcpu); 4984 } 4985 4986 static int handle_wrmsr(struct kvm_vcpu *vcpu) 4987 { 4988 return kvm_emulate_wrmsr(vcpu); 4989 } 4990 4991 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 4992 { 4993 kvm_apic_update_ppr(vcpu); 4994 return 1; 4995 } 4996 4997 static int handle_interrupt_window(struct kvm_vcpu *vcpu) 4998 { 4999 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING); 5000 5001 kvm_make_request(KVM_REQ_EVENT, vcpu); 5002 5003 ++vcpu->stat.irq_window_exits; 5004 return 1; 5005 } 5006 5007 static int handle_halt(struct kvm_vcpu *vcpu) 5008 { 5009 return kvm_emulate_halt(vcpu); 5010 } 5011 5012 static int handle_vmcall(struct kvm_vcpu *vcpu) 5013 { 5014 return kvm_emulate_hypercall(vcpu); 5015 } 5016 5017 static int handle_invd(struct kvm_vcpu *vcpu) 5018 { 5019 return kvm_emulate_instruction(vcpu, 0); 5020 } 5021 5022 static int handle_invlpg(struct kvm_vcpu *vcpu) 5023 { 5024 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5025 5026 kvm_mmu_invlpg(vcpu, exit_qualification); 5027 return kvm_skip_emulated_instruction(vcpu); 5028 } 5029 5030 static int handle_rdpmc(struct kvm_vcpu *vcpu) 5031 { 5032 int err; 5033 5034 err = kvm_rdpmc(vcpu); 5035 return kvm_complete_insn_gp(vcpu, err); 5036 } 5037 5038 static int handle_wbinvd(struct kvm_vcpu *vcpu) 5039 { 5040 return kvm_emulate_wbinvd(vcpu); 5041 } 5042 5043 static int handle_xsetbv(struct kvm_vcpu *vcpu) 5044 { 5045 u64 new_bv = kvm_read_edx_eax(vcpu); 5046 u32 index = kvm_rcx_read(vcpu); 5047 5048 if (kvm_set_xcr(vcpu, index, new_bv) == 0) 5049 return kvm_skip_emulated_instruction(vcpu); 5050 return 1; 5051 } 5052 5053 static int handle_apic_access(struct kvm_vcpu *vcpu) 5054 { 5055 if (likely(fasteoi)) { 5056 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5057 int access_type, offset; 5058 5059 access_type = exit_qualification & APIC_ACCESS_TYPE; 5060 offset = exit_qualification & APIC_ACCESS_OFFSET; 5061 /* 5062 * Sane guest uses MOV to write EOI, with written value 5063 * not cared. 
So make a short-circuit here by avoiding 5064 * heavy instruction emulation. 5065 */ 5066 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5067 (offset == APIC_EOI)) { 5068 kvm_lapic_set_eoi(vcpu); 5069 return kvm_skip_emulated_instruction(vcpu); 5070 } 5071 } 5072 return kvm_emulate_instruction(vcpu, 0); 5073 } 5074 5075 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5076 { 5077 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5078 int vector = exit_qualification & 0xff; 5079 5080 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5081 kvm_apic_set_eoi_accelerated(vcpu, vector); 5082 return 1; 5083 } 5084 5085 static int handle_apic_write(struct kvm_vcpu *vcpu) 5086 { 5087 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5088 u32 offset = exit_qualification & 0xfff; 5089 5090 /* APIC-write VM exit is trap-like and thus no need to adjust IP */ 5091 kvm_apic_write_nodecode(vcpu, offset); 5092 return 1; 5093 } 5094 5095 static int handle_task_switch(struct kvm_vcpu *vcpu) 5096 { 5097 struct vcpu_vmx *vmx = to_vmx(vcpu); 5098 unsigned long exit_qualification; 5099 bool has_error_code = false; 5100 u32 error_code = 0; 5101 u16 tss_selector; 5102 int reason, type, idt_v, idt_index; 5103 5104 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5105 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5106 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5107 5108 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5109 5110 reason = (u32)exit_qualification >> 30; 5111 if (reason == TASK_SWITCH_GATE && idt_v) { 5112 switch (type) { 5113 case INTR_TYPE_NMI_INTR: 5114 vcpu->arch.nmi_injected = false; 5115 vmx_set_nmi_mask(vcpu, true); 5116 break; 5117 case INTR_TYPE_EXT_INTR: 5118 case INTR_TYPE_SOFT_INTR: 5119 kvm_clear_interrupt_queue(vcpu); 5120 break; 5121 case INTR_TYPE_HARD_EXCEPTION: 5122 if (vmx->idt_vectoring_info & 5123 VECTORING_INFO_DELIVER_CODE_MASK) { 5124 has_error_code = true; 5125 error_code = 5126 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5127 } 5128 /* fall through */ 5129 case INTR_TYPE_SOFT_EXCEPTION: 5130 kvm_clear_exception_queue(vcpu); 5131 break; 5132 default: 5133 break; 5134 } 5135 } 5136 tss_selector = exit_qualification; 5137 5138 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5139 type != INTR_TYPE_EXT_INTR && 5140 type != INTR_TYPE_NMI_INTR)) 5141 WARN_ON(!skip_emulated_instruction(vcpu)); 5142 5143 /* 5144 * TODO: What about debug traps on tss switch? 5145 * Are we supposed to inject them and update dr6? 5146 */ 5147 return kvm_task_switch(vcpu, tss_selector, 5148 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, 5149 reason, has_error_code, error_code); 5150 } 5151 5152 static int handle_ept_violation(struct kvm_vcpu *vcpu) 5153 { 5154 unsigned long exit_qualification; 5155 gpa_t gpa; 5156 u64 error_code; 5157 5158 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5159 5160 /* 5161 * EPT violation happened while executing iret from NMI, 5162 * "blocked by NMI" bit has to be set before next VM entry. 5163 * There are errata that may cause this bit to not be set: 5164 * AAK134, BY25. 5165 */ 5166 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5167 enable_vnmi && 5168 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5169 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5170 5171 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5172 trace_kvm_page_fault(gpa, exit_qualification); 5173 5174 /* Is it a read fault? 
*/ 5175 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) 5176 ? PFERR_USER_MASK : 0; 5177 /* Is it a write fault? */ 5178 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) 5179 ? PFERR_WRITE_MASK : 0; 5180 /* Is it a fetch fault? */ 5181 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) 5182 ? PFERR_FETCH_MASK : 0; 5183 /* ept page table entry is present? */ 5184 error_code |= (exit_qualification & 5185 (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | 5186 EPT_VIOLATION_EXECUTABLE)) 5187 ? PFERR_PRESENT_MASK : 0; 5188 5189 error_code |= (exit_qualification & 0x100) != 0 ? 5190 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; 5191 5192 vcpu->arch.exit_qualification = exit_qualification; 5193 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5194 } 5195 5196 static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5197 { 5198 gpa_t gpa; 5199 5200 /* 5201 * A nested guest cannot optimize MMIO vmexits, because we have an 5202 * nGPA here instead of the required GPA. 5203 */ 5204 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5205 if (!is_guest_mode(vcpu) && 5206 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5207 trace_kvm_fast_mmio(gpa); 5208 return kvm_skip_emulated_instruction(vcpu); 5209 } 5210 5211 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); 5212 } 5213 5214 static int handle_nmi_window(struct kvm_vcpu *vcpu) 5215 { 5216 WARN_ON_ONCE(!enable_vnmi); 5217 exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING); 5218 ++vcpu->stat.nmi_window_exits; 5219 kvm_make_request(KVM_REQ_EVENT, vcpu); 5220 5221 return 1; 5222 } 5223 5224 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5225 { 5226 struct vcpu_vmx *vmx = to_vmx(vcpu); 5227 bool intr_window_requested; 5228 unsigned count = 130; 5229 5230 /* 5231 * We should never reach the point where we are emulating L2 5232 * due to invalid guest state as that means we incorrectly 5233 * allowed a nested VMEntry with an invalid vmcs12. 5234 */ 5235 WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); 5236 5237 intr_window_requested = exec_controls_get(vmx) & 5238 CPU_BASED_VIRTUAL_INTR_PENDING; 5239 5240 while (vmx->emulation_required && count-- != 0) { 5241 if (intr_window_requested && vmx_interrupt_allowed(vcpu)) 5242 return handle_interrupt_window(&vmx->vcpu); 5243 5244 if (kvm_test_request(KVM_REQ_EVENT, vcpu)) 5245 return 1; 5246 5247 if (!kvm_emulate_instruction(vcpu, 0)) 5248 return 0; 5249 5250 if (vmx->emulation_required && !vmx->rmode.vm86_active && 5251 vcpu->arch.exception.pending) { 5252 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5253 vcpu->run->internal.suberror = 5254 KVM_INTERNAL_ERROR_EMULATION; 5255 vcpu->run->internal.ndata = 0; 5256 return 0; 5257 } 5258 5259 if (vcpu->arch.halt_request) { 5260 vcpu->arch.halt_request = 0; 5261 return kvm_vcpu_halt(vcpu); 5262 } 5263 5264 /* 5265 * Note, return 1 and not 0, vcpu_run() is responsible for 5266 * morphing the pending signal into the proper return code. 
	 */
		if (signal_pending(current))
			return 1;

		if (need_resched())
			schedule();
	}

	return 1;
}

static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int old = vmx->ple_window;

	vmx->ple_window = __grow_ple_window(old, ple_window,
					    ple_window_grow,
					    ple_window_max);

	if (vmx->ple_window != old) {
		vmx->ple_window_dirty = true;
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    vmx->ple_window, old);
	}
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned int old = vmx->ple_window;

	vmx->ple_window = __shrink_ple_window(old, ple_window,
					      ple_window_shrink,
					      ple_window);

	if (vmx->ple_window != old) {
		vmx->ple_window_dirty = true;
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    vmx->ple_window, old);
	}
}

/*
 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
 */
static void wakeup_handler(void)
{
	struct kvm_vcpu *vcpu;
	int cpu = smp_processor_id();

	spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
	list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
			blocked_vcpu_list) {
		struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

		if (pi_test_on(pi_desc) == 1)
			kvm_vcpu_kick(vcpu);
	}
	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
}

static void vmx_enable_tdp(void)
{
	kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
		enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
		enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
		0ull, VMX_EPT_EXECUTABLE_MASK,
		cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
		VMX_EPT_RWX_MASK, 0ull);

	ept_set_mmio_spte_mask();
	kvm_enable_tdp();
}

/*
 * The guest has used PAUSE in a busy-wait loop (e.g. on a spinlock).  Plain
 * PAUSE exiting is never enabled, so we only get here on CPUs that support
 * Pause-Loop Exiting.
 */
static int handle_pause(struct kvm_vcpu *vcpu)
{
	if (!kvm_pause_in_guest(vcpu->kvm))
		grow_ple_window(vcpu);

	/*
	 * Per Intel SDM Vol. 3, Section 25.1.3, the "PAUSE-loop exiting"
	 * VM-execution control is ignored if CPL > 0.  KVM never sets
	 * PAUSE_EXITING and only sets PLE when it is supported, so the
	 * vcpu must be at CPL 0 if it gets a PAUSE exit.
5356 */ 5357 kvm_vcpu_on_spin(vcpu, true); 5358 return kvm_skip_emulated_instruction(vcpu); 5359 } 5360 5361 static int handle_nop(struct kvm_vcpu *vcpu) 5362 { 5363 return kvm_skip_emulated_instruction(vcpu); 5364 } 5365 5366 static int handle_mwait(struct kvm_vcpu *vcpu) 5367 { 5368 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); 5369 return handle_nop(vcpu); 5370 } 5371 5372 static int handle_invalid_op(struct kvm_vcpu *vcpu) 5373 { 5374 kvm_queue_exception(vcpu, UD_VECTOR); 5375 return 1; 5376 } 5377 5378 static int handle_monitor_trap(struct kvm_vcpu *vcpu) 5379 { 5380 return 1; 5381 } 5382 5383 static int handle_monitor(struct kvm_vcpu *vcpu) 5384 { 5385 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); 5386 return handle_nop(vcpu); 5387 } 5388 5389 static int handle_invpcid(struct kvm_vcpu *vcpu) 5390 { 5391 u32 vmx_instruction_info; 5392 unsigned long type; 5393 bool pcid_enabled; 5394 gva_t gva; 5395 struct x86_exception e; 5396 unsigned i; 5397 unsigned long roots_to_free = 0; 5398 struct { 5399 u64 pcid; 5400 u64 gla; 5401 } operand; 5402 5403 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { 5404 kvm_queue_exception(vcpu, UD_VECTOR); 5405 return 1; 5406 } 5407 5408 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5409 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 5410 5411 if (type > 3) { 5412 kvm_inject_gp(vcpu, 0); 5413 return 1; 5414 } 5415 5416 /* According to the Intel instruction reference, the memory operand 5417 * is read even if it isn't needed (e.g., for type==all) 5418 */ 5419 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 5420 vmx_instruction_info, false, 5421 sizeof(operand), &gva)) 5422 return 1; 5423 5424 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { 5425 kvm_inject_page_fault(vcpu, &e); 5426 return 1; 5427 } 5428 5429 if (operand.pcid >> 12 != 0) { 5430 kvm_inject_gp(vcpu, 0); 5431 return 1; 5432 } 5433 5434 pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); 5435 5436 switch (type) { 5437 case INVPCID_TYPE_INDIV_ADDR: 5438 if ((!pcid_enabled && (operand.pcid != 0)) || 5439 is_noncanonical_address(operand.gla, vcpu)) { 5440 kvm_inject_gp(vcpu, 0); 5441 return 1; 5442 } 5443 kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); 5444 return kvm_skip_emulated_instruction(vcpu); 5445 5446 case INVPCID_TYPE_SINGLE_CTXT: 5447 if (!pcid_enabled && (operand.pcid != 0)) { 5448 kvm_inject_gp(vcpu, 0); 5449 return 1; 5450 } 5451 5452 if (kvm_get_active_pcid(vcpu) == operand.pcid) { 5453 kvm_mmu_sync_roots(vcpu); 5454 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 5455 } 5456 5457 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 5458 if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) 5459 == operand.pcid) 5460 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5461 5462 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); 5463 /* 5464 * If neither the current cr3 nor any of the prev_roots use the 5465 * given PCID, then nothing needs to be done here because a 5466 * resync will happen anyway before switching to any other CR3. 5467 */ 5468 5469 return kvm_skip_emulated_instruction(vcpu); 5470 5471 case INVPCID_TYPE_ALL_NON_GLOBAL: 5472 /* 5473 * Currently, KVM doesn't mark global entries in the shadow 5474 * page tables, so a non-global flush just degenerates to a 5475 * global flush. If needed, we could optimize this later by 5476 * keeping track of global entries in shadow page tables. 
		 */

		/* fall-through */
	case INVPCID_TYPE_ALL_INCL_GLOBAL:
		kvm_mmu_unload(vcpu);
		return kvm_skip_emulated_instruction(vcpu);

	default:
		BUG(); /* We have already checked above that type <= 3 */
	}
}

static int handle_pml_full(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification;

	trace_kvm_pml_full(vcpu->vcpu_id);

	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

	/*
	 * If the PML buffer filled up while executing IRET from an NMI,
	 * the "blocked by NMI" bit has to be set before the next VM entry.
	 */
	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
			enable_vnmi &&
			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				GUEST_INTR_STATE_NMI);

	/*
	 * The PML buffer was already flushed at the beginning of the VM exit.
	 * Nothing to do here, and no userspace involvement is needed for PML.
	 */
	return 1;
}

static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->req_immediate_exit &&
	    !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
		kvm_lapic_expired_hv_timer(vcpu);

	return 1;
}

/*
 * When nested=0, all VMX instruction VM exits land here.  The handlers
 * are overwritten by nested_vmx_setup() when nested=1.
 */
static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
{
	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}

static int handle_encls(struct kvm_vcpu *vcpu)
{
	/*
	 * SGX virtualization is not yet supported.  There is no software
	 * enable bit for SGX, so we have to trap ENCLS and inject a #UD
	 * to prevent the guest from executing ENCLS.
	 */
	kvm_queue_exception(vcpu, UD_VECTOR);
	return 1;
}

static int handle_unexpected_vmexit(struct kvm_vcpu *vcpu)
{
	kvm_skip_emulated_instruction(vcpu);
	WARN_ONCE(1, "Unexpected VM-Exit Reason = 0x%x",
		vmcs_read32(VM_EXIT_REASON));
	return 1;
}

/*
 * The exit handlers return 1 if the exit was handled fully and guest execution
 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
 * to be done to userspace and return 0.
5558 */ 5559 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 5560 [EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi, 5561 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 5562 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 5563 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 5564 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 5565 [EXIT_REASON_CR_ACCESS] = handle_cr, 5566 [EXIT_REASON_DR_ACCESS] = handle_dr, 5567 [EXIT_REASON_CPUID] = handle_cpuid, 5568 [EXIT_REASON_MSR_READ] = handle_rdmsr, 5569 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 5570 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 5571 [EXIT_REASON_HLT] = handle_halt, 5572 [EXIT_REASON_INVD] = handle_invd, 5573 [EXIT_REASON_INVLPG] = handle_invlpg, 5574 [EXIT_REASON_RDPMC] = handle_rdpmc, 5575 [EXIT_REASON_VMCALL] = handle_vmcall, 5576 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, 5577 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, 5578 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, 5579 [EXIT_REASON_VMPTRST] = handle_vmx_instruction, 5580 [EXIT_REASON_VMREAD] = handle_vmx_instruction, 5581 [EXIT_REASON_VMRESUME] = handle_vmx_instruction, 5582 [EXIT_REASON_VMWRITE] = handle_vmx_instruction, 5583 [EXIT_REASON_VMOFF] = handle_vmx_instruction, 5584 [EXIT_REASON_VMON] = handle_vmx_instruction, 5585 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 5586 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 5587 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 5588 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 5589 [EXIT_REASON_WBINVD] = handle_wbinvd, 5590 [EXIT_REASON_XSETBV] = handle_xsetbv, 5591 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 5592 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 5593 [EXIT_REASON_GDTR_IDTR] = handle_desc, 5594 [EXIT_REASON_LDTR_TR] = handle_desc, 5595 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 5596 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 5597 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 5598 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, 5599 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 5600 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, 5601 [EXIT_REASON_INVEPT] = handle_vmx_instruction, 5602 [EXIT_REASON_INVVPID] = handle_vmx_instruction, 5603 [EXIT_REASON_RDRAND] = handle_invalid_op, 5604 [EXIT_REASON_RDSEED] = handle_invalid_op, 5605 [EXIT_REASON_XSAVES] = handle_unexpected_vmexit, 5606 [EXIT_REASON_XRSTORS] = handle_unexpected_vmexit, 5607 [EXIT_REASON_PML_FULL] = handle_pml_full, 5608 [EXIT_REASON_INVPCID] = handle_invpcid, 5609 [EXIT_REASON_VMFUNC] = handle_vmx_instruction, 5610 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 5611 [EXIT_REASON_ENCLS] = handle_encls, 5612 [EXIT_REASON_UMWAIT] = handle_unexpected_vmexit, 5613 [EXIT_REASON_TPAUSE] = handle_unexpected_vmexit, 5614 }; 5615 5616 static const int kvm_vmx_max_exit_handlers = 5617 ARRAY_SIZE(kvm_vmx_exit_handlers); 5618 5619 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) 5620 { 5621 *info1 = vmcs_readl(EXIT_QUALIFICATION); 5622 *info2 = vmcs_read32(VM_EXIT_INTR_INFO); 5623 } 5624 5625 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 5626 { 5627 if (vmx->pml_pg) { 5628 __free_page(vmx->pml_pg); 5629 vmx->pml_pg = NULL; 5630 } 5631 } 5632 5633 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 5634 { 5635 struct vcpu_vmx *vmx = to_vmx(vcpu); 5636 u64 *pml_buf; 5637 u16 pml_idx; 5638 5639 pml_idx = vmcs_read16(GUEST_PML_INDEX); 5640 5641 /* Do nothing if PML buffer is empty */ 5642 if 
(pml_idx == (PML_ENTITY_NUM - 1)) 5643 return; 5644 5645 /* PML index always points to next available PML buffer entity */ 5646 if (pml_idx >= PML_ENTITY_NUM) 5647 pml_idx = 0; 5648 else 5649 pml_idx++; 5650 5651 pml_buf = page_address(vmx->pml_pg); 5652 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { 5653 u64 gpa; 5654 5655 gpa = pml_buf[pml_idx]; 5656 WARN_ON(gpa & (PAGE_SIZE - 1)); 5657 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 5658 } 5659 5660 /* reset PML index */ 5661 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 5662 } 5663 5664 /* 5665 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. 5666 * Called before reporting dirty_bitmap to userspace. 5667 */ 5668 static void kvm_flush_pml_buffers(struct kvm *kvm) 5669 { 5670 int i; 5671 struct kvm_vcpu *vcpu; 5672 /* 5673 * We only need to kick vcpu out of guest mode here, as PML buffer 5674 * is flushed at beginning of all VMEXITs, and it's obvious that only 5675 * vcpus running in guest are possible to have unflushed GPAs in PML 5676 * buffer. 5677 */ 5678 kvm_for_each_vcpu(i, vcpu, kvm) 5679 kvm_vcpu_kick(vcpu); 5680 } 5681 5682 static void vmx_dump_sel(char *name, uint32_t sel) 5683 { 5684 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 5685 name, vmcs_read16(sel), 5686 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 5687 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 5688 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 5689 } 5690 5691 static void vmx_dump_dtsel(char *name, uint32_t limit) 5692 { 5693 pr_err("%s limit=0x%08x, base=0x%016lx\n", 5694 name, vmcs_read32(limit), 5695 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 5696 } 5697 5698 void dump_vmcs(void) 5699 { 5700 u32 vmentry_ctl, vmexit_ctl; 5701 u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control; 5702 unsigned long cr4; 5703 u64 efer; 5704 int i, n; 5705 5706 if (!dump_invalid_vmcs) { 5707 pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n"); 5708 return; 5709 } 5710 5711 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); 5712 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); 5713 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5714 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); 5715 cr4 = vmcs_readl(GUEST_CR4); 5716 efer = vmcs_read64(GUEST_IA32_EFER); 5717 secondary_exec_control = 0; 5718 if (cpu_has_secondary_exec_ctrls()) 5719 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 5720 5721 pr_err("*** Guest State ***\n"); 5722 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 5723 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), 5724 vmcs_readl(CR0_GUEST_HOST_MASK)); 5725 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 5726 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); 5727 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); 5728 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && 5729 (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) 5730 { 5731 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", 5732 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); 5733 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", 5734 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); 5735 } 5736 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", 5737 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); 5738 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", 5739 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); 5740 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 5741 
vmcs_readl(GUEST_SYSENTER_ESP), 5742 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); 5743 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); 5744 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); 5745 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); 5746 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); 5747 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); 5748 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); 5749 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); 5750 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); 5751 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); 5752 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); 5753 if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || 5754 (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) 5755 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", 5756 efer, vmcs_read64(GUEST_IA32_PAT)); 5757 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", 5758 vmcs_read64(GUEST_IA32_DEBUGCTL), 5759 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); 5760 if (cpu_has_load_perf_global_ctrl() && 5761 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 5762 pr_err("PerfGlobCtl = 0x%016llx\n", 5763 vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); 5764 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) 5765 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); 5766 pr_err("Interruptibility = %08x ActivityState = %08x\n", 5767 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), 5768 vmcs_read32(GUEST_ACTIVITY_STATE)); 5769 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 5770 pr_err("InterruptStatus = %04x\n", 5771 vmcs_read16(GUEST_INTR_STATUS)); 5772 5773 pr_err("*** Host State ***\n"); 5774 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", 5775 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); 5776 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", 5777 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), 5778 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), 5779 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), 5780 vmcs_read16(HOST_TR_SELECTOR)); 5781 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", 5782 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), 5783 vmcs_readl(HOST_TR_BASE)); 5784 pr_err("GDTBase=%016lx IDTBase=%016lx\n", 5785 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); 5786 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", 5787 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), 5788 vmcs_readl(HOST_CR4)); 5789 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 5790 vmcs_readl(HOST_IA32_SYSENTER_ESP), 5791 vmcs_read32(HOST_IA32_SYSENTER_CS), 5792 vmcs_readl(HOST_IA32_SYSENTER_EIP)); 5793 if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) 5794 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", 5795 vmcs_read64(HOST_IA32_EFER), 5796 vmcs_read64(HOST_IA32_PAT)); 5797 if (cpu_has_load_perf_global_ctrl() && 5798 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 5799 pr_err("PerfGlobCtl = 0x%016llx\n", 5800 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); 5801 5802 pr_err("*** Control State ***\n"); 5803 pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", 5804 pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); 5805 pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); 5806 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", 5807 vmcs_read32(EXCEPTION_BITMAP), 5808 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), 5809 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); 5810 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", 5811 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 5812 
vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), 5813 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); 5814 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", 5815 vmcs_read32(VM_EXIT_INTR_INFO), 5816 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 5817 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 5818 pr_err(" reason=%08x qualification=%016lx\n", 5819 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); 5820 pr_err("IDTVectoring: info=%08x errcode=%08x\n", 5821 vmcs_read32(IDT_VECTORING_INFO_FIELD), 5822 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 5823 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); 5824 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) 5825 pr_err("TSC Multiplier = 0x%016llx\n", 5826 vmcs_read64(TSC_MULTIPLIER)); 5827 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) { 5828 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 5829 u16 status = vmcs_read16(GUEST_INTR_STATUS); 5830 pr_err("SVI|RVI = %02x|%02x ", status >> 8, status & 0xff); 5831 } 5832 pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 5833 if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) 5834 pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR)); 5835 pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR)); 5836 } 5837 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 5838 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 5839 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 5840 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); 5841 n = vmcs_read32(CR3_TARGET_COUNT); 5842 for (i = 0; i + 1 < n; i += 4) 5843 pr_err("CR3 target%u=%016lx target%u=%016lx\n", 5844 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), 5845 i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); 5846 if (i < n) 5847 pr_err("CR3 target%u=%016lx\n", 5848 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); 5849 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) 5850 pr_err("PLE Gap=%08x Window=%08x\n", 5851 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); 5852 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) 5853 pr_err("Virtual processor ID = 0x%04x\n", 5854 vmcs_read16(VIRTUAL_PROCESSOR_ID)); 5855 } 5856 5857 /* 5858 * The guest has exited. See if we can fix it or if we need userspace 5859 * assistance. 5860 */ 5861 static int vmx_handle_exit(struct kvm_vcpu *vcpu) 5862 { 5863 struct vcpu_vmx *vmx = to_vmx(vcpu); 5864 u32 exit_reason = vmx->exit_reason; 5865 u32 vectoring_info = vmx->idt_vectoring_info; 5866 5867 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); 5868 5869 /* 5870 * Flush logged GPAs PML buffer, this will make dirty_bitmap more 5871 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before 5872 * querying dirty_bitmap, we only need to kick all vcpus out of guest 5873 * mode as if vcpus is in root mode, the PML buffer must has been 5874 * flushed already. 
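	 * A vCPU that has already returned to VMX root operation cannot hold
	 * stale entries, because vmx_flush_pml_buffer() below runs at the
	 * start of every VM-exit while PML is enabled.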
	 */
	if (enable_pml)
		vmx_flush_pml_buffer(vcpu);

	/* If guest state is invalid, start emulating */
	if (vmx->emulation_required)
		return handle_invalid_guest_state(vcpu);

	if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
		return nested_vmx_reflect_vmexit(vcpu, exit_reason);

	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
		dump_vmcs();
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= exit_reason;
		return 0;
	}

	if (unlikely(vmx->fail)) {
		dump_vmcs();
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= vmcs_read32(VM_INSTRUCTION_ERROR);
		return 0;
	}

	/*
	 * Note:
	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
	 * event delivery, since that indicates the guest is accessing MMIO.
	 * The VM-exit would simply trigger again after returning to the
	 * guest, causing an infinite loop.
	 */
	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
	    exit_reason != EXIT_REASON_EPT_VIOLATION &&
	    exit_reason != EXIT_REASON_PML_FULL &&
	    exit_reason != EXIT_REASON_TASK_SWITCH)) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
		vcpu->run->internal.ndata = 3;
		vcpu->run->internal.data[0] = vectoring_info;
		vcpu->run->internal.data[1] = exit_reason;
		vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
		if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
			vcpu->run->internal.ndata++;
			vcpu->run->internal.data[3] =
				vmcs_read64(GUEST_PHYSICAL_ADDRESS);
		}
		return 0;
	}

	if (unlikely(!enable_vnmi &&
		     vmx->loaded_vmcs->soft_vnmi_blocked)) {
		if (vmx_interrupt_allowed(vcpu)) {
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
			   vcpu->arch.nmi_pending) {
			/*
			 * This CPU doesn't let us find the end of an
			 * NMI-blocked window if the guest runs with IRQs
			 * disabled. So we pull the trigger after 1 s of
			 * futile waiting, but inform the user about it.
			 */
			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
			       "state on VCPU %d after 1 s timeout\n",
			       __func__, vcpu->vcpu_id);
			vmx->loaded_vmcs->soft_vnmi_blocked = 0;
		}
	}

	if (exit_reason < kvm_vmx_max_exit_handlers
	    && kvm_vmx_exit_handlers[exit_reason])
		return kvm_vmx_exit_handlers[exit_reason](vcpu);
	else {
		vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
			    exit_reason);
		dump_vmcs();
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror =
			KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
		vcpu->run->internal.ndata = 1;
		vcpu->run->internal.data[0] = exit_reason;
		return 0;
	}
}

/*
 * Software based L1D cache flush which is used when microcode providing
 * the cache control MSR is not loaded.
 *
 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but
 * flushing it requires reading in 64 KiB because the replacement algorithm
 * is not exactly LRU. This could be sized at runtime via topology
 * information, but as all relevant affected CPUs have a 32 KiB L1D cache
 * there is no point in doing so.
 */
static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
	int size = PAGE_SIZE << L1D_CACHE_ORDER;

	/*
	 * This code is only executed when the flush mode is 'cond' or
	 * 'always'.
	 */
	if (static_branch_likely(&vmx_l1d_flush_cond)) {
		bool flush_l1d;

		/*
		 * Clear the per-vcpu flush bit; it gets set again
		 * either from vcpu_run() or from one of the unsafe
		 * VMEXIT handlers.
		 */
		flush_l1d = vcpu->arch.l1tf_flush_l1d;
		vcpu->arch.l1tf_flush_l1d = false;

		/*
		 * Clear the per-cpu flush bit; it gets set again from
		 * the interrupt handlers.
		 */
		flush_l1d |= kvm_get_cpu_l1tf_flush_l1d();
		kvm_clear_cpu_l1tf_flush_l1d();

		if (!flush_l1d)
			return;
	}

	vcpu->stat.l1d_flush++;

	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
		wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
		return;
	}

	asm volatile(
		/* First ensure the pages are in the TLB */
		"xorl %%eax, %%eax\n"
		".Lpopulate_tlb:\n\t"
		"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $4096, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne .Lpopulate_tlb\n\t"
		"xorl %%eax, %%eax\n\t"
		"cpuid\n\t"
		/* Now fill the cache */
		"xorl %%eax, %%eax\n"
		".Lfill_cache:\n"
		"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
		"addl $64, %%eax\n\t"
		"cmpl %%eax, %[size]\n\t"
		"jne .Lfill_cache\n\t"
		"lfence\n"
		:: [flush_pages] "r" (vmx_l1d_flush_pages),
		    [size] "r" (size)
		: "eax", "ebx", "ecx", "edx");
}

static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (is_guest_mode(vcpu) &&
		nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return;

	if (irr == -1 || tpr < irr) {
		vmcs_write32(TPR_THRESHOLD, 0);
		return;
	}

	vmcs_write32(TPR_THRESHOLD, irr);
}

void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 sec_exec_control;

	if (!lapic_in_kernel(vcpu))
		return;

	if (!flexpriority_enabled &&
	    !cpu_has_vmx_virtualize_x2apic_mode())
		return;

	/* Postpone execution until vmcs01 is the current VMCS.
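	 * The pending change is recorded in
	 * vmx->nested.change_vmcs01_virtual_apic_mode and applied to vmcs01
	 * when we later switch back from L2 to L1.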
*/ 6062 if (is_guest_mode(vcpu)) { 6063 vmx->nested.change_vmcs01_virtual_apic_mode = true; 6064 return; 6065 } 6066 6067 sec_exec_control = secondary_exec_controls_get(vmx); 6068 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 6069 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 6070 6071 switch (kvm_get_apic_mode(vcpu)) { 6072 case LAPIC_MODE_INVALID: 6073 WARN_ONCE(true, "Invalid local APIC state"); 6074 case LAPIC_MODE_DISABLED: 6075 break; 6076 case LAPIC_MODE_XAPIC: 6077 if (flexpriority_enabled) { 6078 sec_exec_control |= 6079 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6080 vmx_flush_tlb(vcpu, true); 6081 } 6082 break; 6083 case LAPIC_MODE_X2APIC: 6084 if (cpu_has_vmx_virtualize_x2apic_mode()) 6085 sec_exec_control |= 6086 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 6087 break; 6088 } 6089 secondary_exec_controls_set(vmx, sec_exec_control); 6090 6091 vmx_update_msr_bitmap(vcpu); 6092 } 6093 6094 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) 6095 { 6096 if (!is_guest_mode(vcpu)) { 6097 vmcs_write64(APIC_ACCESS_ADDR, hpa); 6098 vmx_flush_tlb(vcpu, true); 6099 } 6100 } 6101 6102 static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) 6103 { 6104 u16 status; 6105 u8 old; 6106 6107 if (max_isr == -1) 6108 max_isr = 0; 6109 6110 status = vmcs_read16(GUEST_INTR_STATUS); 6111 old = status >> 8; 6112 if (max_isr != old) { 6113 status &= 0xff; 6114 status |= max_isr << 8; 6115 vmcs_write16(GUEST_INTR_STATUS, status); 6116 } 6117 } 6118 6119 static void vmx_set_rvi(int vector) 6120 { 6121 u16 status; 6122 u8 old; 6123 6124 if (vector == -1) 6125 vector = 0; 6126 6127 status = vmcs_read16(GUEST_INTR_STATUS); 6128 old = (u8)status & 0xff; 6129 if ((u8)vector != old) { 6130 status &= ~0xff; 6131 status |= (u8)vector; 6132 vmcs_write16(GUEST_INTR_STATUS, status); 6133 } 6134 } 6135 6136 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 6137 { 6138 /* 6139 * When running L2, updating RVI is only relevant when 6140 * vmcs12 virtual-interrupt-delivery enabled. 6141 * However, it can be enabled only when L1 also 6142 * intercepts external-interrupts and in that case 6143 * we should not update vmcs02 RVI but instead intercept 6144 * interrupt. Therefore, do nothing when running L2. 6145 */ 6146 if (!is_guest_mode(vcpu)) 6147 vmx_set_rvi(max_irr); 6148 } 6149 6150 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 6151 { 6152 struct vcpu_vmx *vmx = to_vmx(vcpu); 6153 int max_irr; 6154 bool max_irr_updated; 6155 6156 WARN_ON(!vcpu->arch.apicv_active); 6157 if (pi_test_on(&vmx->pi_desc)) { 6158 pi_clear_on(&vmx->pi_desc); 6159 /* 6160 * IOMMU can write to PIR.ON, so the barrier matters even on UP. 6161 * But on x86 this is just a compiler barrier anyway. 6162 */ 6163 smp_mb__after_atomic(); 6164 max_irr_updated = 6165 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); 6166 6167 /* 6168 * If we are running L2 and L1 has a new pending interrupt 6169 * which can be injected, we should re-evaluate 6170 * what should be done with this new L1 interrupt. 6171 * If L1 intercepts external-interrupts, we should 6172 * exit from L2 to L1. Otherwise, interrupt should be 6173 * delivered directly to L2. 
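		 * An exit to L1 is requested below via
		 * kvm_vcpu_exiting_guest_mode(); direct delivery to L2 is
		 * re-evaluated by raising KVM_REQ_EVENT.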
6174 */ 6175 if (is_guest_mode(vcpu) && max_irr_updated) { 6176 if (nested_exit_on_intr(vcpu)) 6177 kvm_vcpu_exiting_guest_mode(vcpu); 6178 else 6179 kvm_make_request(KVM_REQ_EVENT, vcpu); 6180 } 6181 } else { 6182 max_irr = kvm_lapic_find_highest_irr(vcpu); 6183 } 6184 vmx_hwapic_irr_update(vcpu, max_irr); 6185 return max_irr; 6186 } 6187 6188 static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) 6189 { 6190 return pi_test_on(vcpu_to_pi_desc(vcpu)); 6191 } 6192 6193 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 6194 { 6195 if (!kvm_vcpu_apicv_active(vcpu)) 6196 return; 6197 6198 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 6199 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 6200 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 6201 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 6202 } 6203 6204 static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) 6205 { 6206 struct vcpu_vmx *vmx = to_vmx(vcpu); 6207 6208 pi_clear_on(&vmx->pi_desc); 6209 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); 6210 } 6211 6212 static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) 6213 { 6214 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 6215 6216 /* if exit due to PF check for async PF */ 6217 if (is_page_fault(vmx->exit_intr_info)) 6218 vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); 6219 6220 /* Handle machine checks before interrupts are enabled */ 6221 if (is_machine_check(vmx->exit_intr_info)) 6222 kvm_machine_check(); 6223 6224 /* We need to handle NMIs before interrupts are enabled */ 6225 if (is_nmi(vmx->exit_intr_info)) { 6226 kvm_before_interrupt(&vmx->vcpu); 6227 asm("int $2"); 6228 kvm_after_interrupt(&vmx->vcpu); 6229 } 6230 } 6231 6232 static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) 6233 { 6234 unsigned int vector; 6235 unsigned long entry; 6236 #ifdef CONFIG_X86_64 6237 unsigned long tmp; 6238 #endif 6239 gate_desc *desc; 6240 u32 intr_info; 6241 6242 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 6243 if (WARN_ONCE(!is_external_intr(intr_info), 6244 "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) 6245 return; 6246 6247 vector = intr_info & INTR_INFO_VECTOR_MASK; 6248 desc = (gate_desc *)host_idt_base + vector; 6249 entry = gate_offset(desc); 6250 6251 kvm_before_interrupt(vcpu); 6252 6253 asm volatile( 6254 #ifdef CONFIG_X86_64 6255 "mov %%" _ASM_SP ", %[sp]\n\t" 6256 "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" 6257 "push $%c[ss]\n\t" 6258 "push %[sp]\n\t" 6259 #endif 6260 "pushf\n\t" 6261 __ASM_SIZE(push) " $%c[cs]\n\t" 6262 CALL_NOSPEC 6263 : 6264 #ifdef CONFIG_X86_64 6265 [sp]"=&r"(tmp), 6266 #endif 6267 ASM_CALL_CONSTRAINT 6268 : 6269 THUNK_TARGET(entry), 6270 [ss]"i"(__KERNEL_DS), 6271 [cs]"i"(__KERNEL_CS) 6272 ); 6273 6274 kvm_after_interrupt(vcpu); 6275 } 6276 STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); 6277 6278 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) 6279 { 6280 struct vcpu_vmx *vmx = to_vmx(vcpu); 6281 6282 if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) 6283 handle_external_interrupt_irqoff(vcpu); 6284 else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) 6285 handle_exception_nmi_irqoff(vmx); 6286 } 6287 6288 static bool vmx_has_emulated_msr(int index) 6289 { 6290 switch (index) { 6291 case MSR_IA32_SMBASE: 6292 /* 6293 * We cannot do SMM unless we can run the guest in big 6294 * real mode. 
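		 * Big real mode requires either unrestricted guest support
		 * or the invalid-guest-state emulation path, hence the
		 * check below.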
6295 */ 6296 return enable_unrestricted_guest || emulate_invalid_guest_state; 6297 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 6298 return nested; 6299 case MSR_AMD64_VIRT_SPEC_CTRL: 6300 /* This is AMD only. */ 6301 return false; 6302 default: 6303 return true; 6304 } 6305 } 6306 6307 static bool vmx_pt_supported(void) 6308 { 6309 return pt_mode == PT_MODE_HOST_GUEST; 6310 } 6311 6312 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 6313 { 6314 u32 exit_intr_info; 6315 bool unblock_nmi; 6316 u8 vector; 6317 bool idtv_info_valid; 6318 6319 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 6320 6321 if (enable_vnmi) { 6322 if (vmx->loaded_vmcs->nmi_known_unmasked) 6323 return; 6324 /* 6325 * Can't use vmx->exit_intr_info since we're not sure what 6326 * the exit reason is. 6327 */ 6328 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 6329 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 6330 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 6331 /* 6332 * SDM 3: 27.7.1.2 (September 2008) 6333 * Re-set bit "block by NMI" before VM entry if vmexit caused by 6334 * a guest IRET fault. 6335 * SDM 3: 23.2.2 (September 2008) 6336 * Bit 12 is undefined in any of the following cases: 6337 * If the VM exit sets the valid bit in the IDT-vectoring 6338 * information field. 6339 * If the VM exit is due to a double fault. 6340 */ 6341 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 6342 vector != DF_VECTOR && !idtv_info_valid) 6343 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6344 GUEST_INTR_STATE_NMI); 6345 else 6346 vmx->loaded_vmcs->nmi_known_unmasked = 6347 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 6348 & GUEST_INTR_STATE_NMI); 6349 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) 6350 vmx->loaded_vmcs->vnmi_blocked_time += 6351 ktime_to_ns(ktime_sub(ktime_get(), 6352 vmx->loaded_vmcs->entry_time)); 6353 } 6354 6355 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 6356 u32 idt_vectoring_info, 6357 int instr_len_field, 6358 int error_code_field) 6359 { 6360 u8 vector; 6361 int type; 6362 bool idtv_info_valid; 6363 6364 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 6365 6366 vcpu->arch.nmi_injected = false; 6367 kvm_clear_exception_queue(vcpu); 6368 kvm_clear_interrupt_queue(vcpu); 6369 6370 if (!idtv_info_valid) 6371 return; 6372 6373 kvm_make_request(KVM_REQ_EVENT, vcpu); 6374 6375 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 6376 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 6377 6378 switch (type) { 6379 case INTR_TYPE_NMI_INTR: 6380 vcpu->arch.nmi_injected = true; 6381 /* 6382 * SDM 3: 27.7.1.2 (September 2008) 6383 * Clear bit "block by NMI" before VM entry if a NMI 6384 * delivery faulted. 
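		 * vmx_set_nmi_mask(vcpu, false) below drops the blocking so
		 * that the NMI can be re-injected on the next VM entry.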
6385 */ 6386 vmx_set_nmi_mask(vcpu, false); 6387 break; 6388 case INTR_TYPE_SOFT_EXCEPTION: 6389 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 6390 /* fall through */ 6391 case INTR_TYPE_HARD_EXCEPTION: 6392 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 6393 u32 err = vmcs_read32(error_code_field); 6394 kvm_requeue_exception_e(vcpu, vector, err); 6395 } else 6396 kvm_requeue_exception(vcpu, vector); 6397 break; 6398 case INTR_TYPE_SOFT_INTR: 6399 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 6400 /* fall through */ 6401 case INTR_TYPE_EXT_INTR: 6402 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 6403 break; 6404 default: 6405 break; 6406 } 6407 } 6408 6409 static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 6410 { 6411 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 6412 VM_EXIT_INSTRUCTION_LEN, 6413 IDT_VECTORING_ERROR_CODE); 6414 } 6415 6416 static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 6417 { 6418 __vmx_complete_interrupts(vcpu, 6419 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6420 VM_ENTRY_INSTRUCTION_LEN, 6421 VM_ENTRY_EXCEPTION_ERROR_CODE); 6422 6423 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 6424 } 6425 6426 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 6427 { 6428 int i, nr_msrs; 6429 struct perf_guest_switch_msr *msrs; 6430 6431 msrs = perf_guest_get_msrs(&nr_msrs); 6432 6433 if (!msrs) 6434 return; 6435 6436 for (i = 0; i < nr_msrs; i++) 6437 if (msrs[i].host == msrs[i].guest) 6438 clear_atomic_switch_msr(vmx, msrs[i].msr); 6439 else 6440 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 6441 msrs[i].host, false); 6442 } 6443 6444 static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx) 6445 { 6446 u32 host_umwait_control; 6447 6448 if (!vmx_has_waitpkg(vmx)) 6449 return; 6450 6451 host_umwait_control = get_umwait_control_msr(); 6452 6453 if (vmx->msr_ia32_umwait_control != host_umwait_control) 6454 add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL, 6455 vmx->msr_ia32_umwait_control, 6456 host_umwait_control, false); 6457 else 6458 clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL); 6459 } 6460 6461 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) 6462 { 6463 struct vcpu_vmx *vmx = to_vmx(vcpu); 6464 u64 tscl; 6465 u32 delta_tsc; 6466 6467 if (vmx->req_immediate_exit) { 6468 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0); 6469 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 6470 } else if (vmx->hv_deadline_tsc != -1) { 6471 tscl = rdtsc(); 6472 if (vmx->hv_deadline_tsc > tscl) 6473 /* set_hv_timer ensures the delta fits in 32-bits */ 6474 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 6475 cpu_preemption_timer_multi); 6476 else 6477 delta_tsc = 0; 6478 6479 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); 6480 vmx->loaded_vmcs->hv_timer_soft_disabled = false; 6481 } else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) { 6482 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, -1); 6483 vmx->loaded_vmcs->hv_timer_soft_disabled = true; 6484 } 6485 } 6486 6487 void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) 6488 { 6489 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { 6490 vmx->loaded_vmcs->host_state.rsp = host_rsp; 6491 vmcs_writel(HOST_RSP, host_rsp); 6492 } 6493 } 6494 6495 bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); 6496 6497 static void vmx_vcpu_run(struct kvm_vcpu *vcpu) 6498 { 6499 struct vcpu_vmx *vmx = to_vmx(vcpu); 6500 unsigned long cr3, cr4; 6501 6502 /* Record the guest's net 
vcpu time for enforced NMI injections. */ 6503 if (unlikely(!enable_vnmi && 6504 vmx->loaded_vmcs->soft_vnmi_blocked)) 6505 vmx->loaded_vmcs->entry_time = ktime_get(); 6506 6507 /* Don't enter VMX if guest state is invalid, let the exit handler 6508 start emulation until we arrive back to a valid state */ 6509 if (vmx->emulation_required) 6510 return; 6511 6512 if (vmx->ple_window_dirty) { 6513 vmx->ple_window_dirty = false; 6514 vmcs_write32(PLE_WINDOW, vmx->ple_window); 6515 } 6516 6517 if (vmx->nested.need_vmcs12_to_shadow_sync) 6518 nested_sync_vmcs12_to_shadow(vcpu); 6519 6520 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 6521 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 6522 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) 6523 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 6524 6525 cr3 = __get_current_cr3_fast(); 6526 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 6527 vmcs_writel(HOST_CR3, cr3); 6528 vmx->loaded_vmcs->host_state.cr3 = cr3; 6529 } 6530 6531 cr4 = cr4_read_shadow(); 6532 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 6533 vmcs_writel(HOST_CR4, cr4); 6534 vmx->loaded_vmcs->host_state.cr4 = cr4; 6535 } 6536 6537 /* When single-stepping over STI and MOV SS, we must clear the 6538 * corresponding interruptibility bits in the guest state. Otherwise 6539 * vmentry fails as it then expects bit 14 (BS) in pending debug 6540 * exceptions being set, but that's not correct for the guest debugging 6541 * case. */ 6542 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 6543 vmx_set_interrupt_shadow(vcpu, 0); 6544 6545 kvm_load_guest_xcr0(vcpu); 6546 6547 if (static_cpu_has(X86_FEATURE_PKU) && 6548 kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && 6549 vcpu->arch.pkru != vmx->host_pkru) 6550 __write_pkru(vcpu->arch.pkru); 6551 6552 pt_guest_enter(vmx); 6553 6554 atomic_switch_perf_msrs(vmx); 6555 atomic_switch_umwait_control_msr(vmx); 6556 6557 if (enable_preemption_timer) 6558 vmx_update_hv_timer(vcpu); 6559 6560 if (lapic_in_kernel(vcpu) && 6561 vcpu->arch.apic->lapic_timer.timer_advance_ns) 6562 kvm_wait_lapic_expire(vcpu); 6563 6564 /* 6565 * If this vCPU has touched SPEC_CTRL, restore the guest's value if 6566 * it's non-zero. Since vmentry is serialising on affected CPUs, there 6567 * is no need to worry about the conditional branch over the wrmsr 6568 * being speculatively taken. 6569 */ 6570 x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); 6571 6572 /* L1D Flush includes CPU buffer clear to mitigate MDS */ 6573 if (static_branch_unlikely(&vmx_l1d_should_flush)) 6574 vmx_l1d_flush(vcpu); 6575 else if (static_branch_unlikely(&mds_user_clear)) 6576 mds_clear_cpu_buffers(); 6577 6578 if (vcpu->arch.cr2 != read_cr2()) 6579 write_cr2(vcpu->arch.cr2); 6580 6581 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 6582 vmx->loaded_vmcs->launched); 6583 6584 vcpu->arch.cr2 = read_cr2(); 6585 6586 /* 6587 * We do not use IBRS in the kernel. If this vCPU has used the 6588 * SPEC_CTRL MSR it may have left it on; save the value and 6589 * turn it off. This is much more efficient than blindly adding 6590 * it to the atomic save/restore list. Especially as the former 6591 * (Saving guest MSRs on vmexit) doesn't even exist in KVM. 6592 * 6593 * For non-nested case: 6594 * If the L01 MSR bitmap does not intercept the MSR, then we need to 6595 * save it. 6596 * 6597 * For nested case: 6598 * If the L02 MSR bitmap does not intercept the MSR, then we need to 6599 * save it. 
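	 * msr_write_intercepted() consults the MSR bitmap of the VMCS that
	 * just executed, so a single check handles both cases.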
6600 */ 6601 if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) 6602 vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); 6603 6604 x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); 6605 6606 /* All fields are clean at this point */ 6607 if (static_branch_unlikely(&enable_evmcs)) 6608 current_evmcs->hv_clean_fields |= 6609 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 6610 6611 if (static_branch_unlikely(&enable_evmcs)) 6612 current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index; 6613 6614 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ 6615 if (vmx->host_debugctlmsr) 6616 update_debugctlmsr(vmx->host_debugctlmsr); 6617 6618 #ifndef CONFIG_X86_64 6619 /* 6620 * The sysexit path does not restore ds/es, so we must set them to 6621 * a reasonable value ourselves. 6622 * 6623 * We can't defer this to vmx_prepare_switch_to_host() since that 6624 * function may be executed in interrupt context, which saves and 6625 * restore segments around it, nullifying its effect. 6626 */ 6627 loadsegment(ds, __USER_DS); 6628 loadsegment(es, __USER_DS); 6629 #endif 6630 6631 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 6632 | (1 << VCPU_EXREG_RFLAGS) 6633 | (1 << VCPU_EXREG_PDPTR) 6634 | (1 << VCPU_EXREG_SEGMENTS) 6635 | (1 << VCPU_EXREG_CR3)); 6636 vcpu->arch.regs_dirty = 0; 6637 6638 pt_guest_exit(vmx); 6639 6640 /* 6641 * eager fpu is enabled if PKEY is supported and CR4 is switched 6642 * back on host, so it is safe to read guest PKRU from current 6643 * XSAVE. 6644 */ 6645 if (static_cpu_has(X86_FEATURE_PKU) && 6646 kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { 6647 vcpu->arch.pkru = rdpkru(); 6648 if (vcpu->arch.pkru != vmx->host_pkru) 6649 __write_pkru(vmx->host_pkru); 6650 } 6651 6652 kvm_put_guest_xcr0(vcpu); 6653 6654 vmx->nested.nested_run_pending = 0; 6655 vmx->idt_vectoring_info = 0; 6656 6657 vmx->exit_reason = vmx->fail ? 
0xdead : vmcs_read32(VM_EXIT_REASON); 6658 if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) 6659 kvm_machine_check(); 6660 6661 if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) 6662 return; 6663 6664 vmx->loaded_vmcs->launched = 1; 6665 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 6666 6667 vmx_recover_nmi_blocking(vmx); 6668 vmx_complete_interrupts(vmx); 6669 } 6670 6671 static struct kvm *vmx_vm_alloc(void) 6672 { 6673 struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx), 6674 GFP_KERNEL_ACCOUNT | __GFP_ZERO, 6675 PAGE_KERNEL); 6676 return &kvm_vmx->kvm; 6677 } 6678 6679 static void vmx_vm_free(struct kvm *kvm) 6680 { 6681 kfree(kvm->arch.hyperv.hv_pa_pg); 6682 vfree(to_kvm_vmx(kvm)); 6683 } 6684 6685 static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 6686 { 6687 struct vcpu_vmx *vmx = to_vmx(vcpu); 6688 6689 if (enable_pml) 6690 vmx_destroy_pml_buffer(vmx); 6691 free_vpid(vmx->vpid); 6692 nested_vmx_free_vcpu(vcpu); 6693 free_loaded_vmcs(vmx->loaded_vmcs); 6694 kfree(vmx->guest_msrs); 6695 kvm_vcpu_uninit(vcpu); 6696 kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); 6697 kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); 6698 kmem_cache_free(kvm_vcpu_cache, vmx); 6699 } 6700 6701 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 6702 { 6703 int err; 6704 struct vcpu_vmx *vmx; 6705 unsigned long *msr_bitmap; 6706 int cpu; 6707 6708 BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, 6709 "struct kvm_vcpu must be at offset 0 for arch usercopy region"); 6710 6711 vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); 6712 if (!vmx) 6713 return ERR_PTR(-ENOMEM); 6714 6715 vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, 6716 GFP_KERNEL_ACCOUNT); 6717 if (!vmx->vcpu.arch.user_fpu) { 6718 printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); 6719 err = -ENOMEM; 6720 goto free_partial_vcpu; 6721 } 6722 6723 vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, 6724 GFP_KERNEL_ACCOUNT); 6725 if (!vmx->vcpu.arch.guest_fpu) { 6726 printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); 6727 err = -ENOMEM; 6728 goto free_user_fpu; 6729 } 6730 6731 vmx->vpid = allocate_vpid(); 6732 6733 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 6734 if (err) 6735 goto free_vcpu; 6736 6737 err = -ENOMEM; 6738 6739 /* 6740 * If PML is turned on, failure on enabling PML just results in failure 6741 * of creating the vcpu, therefore we can simplify PML logic (by 6742 * avoiding dealing with cases, such as enabling PML partially on vcpus 6743 * for the guest, etc. 
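	 * If the PML page cannot be allocated, vcpu creation fails with
	 * -ENOMEM and the error path below unwinds everything set up so far.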
6744 */ 6745 if (enable_pml) { 6746 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 6747 if (!vmx->pml_pg) 6748 goto uninit_vcpu; 6749 } 6750 6751 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); 6752 BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) 6753 > PAGE_SIZE); 6754 6755 if (!vmx->guest_msrs) 6756 goto free_pml; 6757 6758 err = alloc_loaded_vmcs(&vmx->vmcs01); 6759 if (err < 0) 6760 goto free_msrs; 6761 6762 msr_bitmap = vmx->vmcs01.msr_bitmap; 6763 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R); 6764 vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); 6765 vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); 6766 vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 6767 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 6768 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 6769 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 6770 if (kvm_cstate_in_guest(kvm)) { 6771 vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R); 6772 vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R); 6773 vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); 6774 vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); 6775 } 6776 vmx->msr_bitmap_mode = 0; 6777 6778 vmx->loaded_vmcs = &vmx->vmcs01; 6779 cpu = get_cpu(); 6780 vmx_vcpu_load(&vmx->vcpu, cpu); 6781 vmx->vcpu.cpu = cpu; 6782 vmx_vcpu_setup(vmx); 6783 vmx_vcpu_put(&vmx->vcpu); 6784 put_cpu(); 6785 if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { 6786 err = alloc_apic_access_page(kvm); 6787 if (err) 6788 goto free_vmcs; 6789 } 6790 6791 if (enable_ept && !enable_unrestricted_guest) { 6792 err = init_rmode_identity_map(kvm); 6793 if (err) 6794 goto free_vmcs; 6795 } 6796 6797 if (nested) 6798 nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, 6799 vmx_capability.ept, 6800 kvm_vcpu_apicv_active(&vmx->vcpu)); 6801 else 6802 memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); 6803 6804 vmx->nested.posted_intr_nv = -1; 6805 vmx->nested.current_vmptr = -1ull; 6806 6807 vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; 6808 6809 /* 6810 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR 6811 * or POSTED_INTR_WAKEUP_VECTOR. 6812 */ 6813 vmx->pi_desc.nv = POSTED_INTR_VECTOR; 6814 vmx->pi_desc.sn = 1; 6815 6816 vmx->ept_pointer = INVALID_PAGE; 6817 6818 return &vmx->vcpu; 6819 6820 free_vmcs: 6821 free_loaded_vmcs(vmx->loaded_vmcs); 6822 free_msrs: 6823 kfree(vmx->guest_msrs); 6824 free_pml: 6825 vmx_destroy_pml_buffer(vmx); 6826 uninit_vcpu: 6827 kvm_vcpu_uninit(&vmx->vcpu); 6828 free_vcpu: 6829 free_vpid(vmx->vpid); 6830 kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); 6831 free_user_fpu: 6832 kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); 6833 free_partial_vcpu: 6834 kmem_cache_free(kvm_vcpu_cache, vmx); 6835 return ERR_PTR(err); 6836 } 6837 6838 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 6839 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. 
See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n" 6840 6841 static int vmx_vm_init(struct kvm *kvm) 6842 { 6843 spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); 6844 6845 if (!ple_gap) 6846 kvm->arch.pause_in_guest = true; 6847 6848 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { 6849 switch (l1tf_mitigation) { 6850 case L1TF_MITIGATION_OFF: 6851 case L1TF_MITIGATION_FLUSH_NOWARN: 6852 /* 'I explicitly don't care' is set */ 6853 break; 6854 case L1TF_MITIGATION_FLUSH: 6855 case L1TF_MITIGATION_FLUSH_NOSMT: 6856 case L1TF_MITIGATION_FULL: 6857 /* 6858 * Warn upon starting the first VM in a potentially 6859 * insecure environment. 6860 */ 6861 if (sched_smt_active()) 6862 pr_warn_once(L1TF_MSG_SMT); 6863 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 6864 pr_warn_once(L1TF_MSG_L1D); 6865 break; 6866 case L1TF_MITIGATION_FULL_FORCE: 6867 /* Flush is enforced */ 6868 break; 6869 } 6870 } 6871 return 0; 6872 } 6873 6874 static int __init vmx_check_processor_compat(void) 6875 { 6876 struct vmcs_config vmcs_conf; 6877 struct vmx_capability vmx_cap; 6878 6879 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) 6880 return -EIO; 6881 if (nested) 6882 nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept, 6883 enable_apicv); 6884 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { 6885 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", 6886 smp_processor_id()); 6887 return -EIO; 6888 } 6889 return 0; 6890 } 6891 6892 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 6893 { 6894 u8 cache; 6895 u64 ipat = 0; 6896 6897 /* For VT-d and EPT combination 6898 * 1. MMIO: always map as UC 6899 * 2. EPT with VT-d: 6900 * a. VT-d without snooping control feature: can't guarantee the 6901 * result, try to trust guest. 6902 * b. VT-d with snooping control feature: snooping control feature of 6903 * VT-d engine can guarantee the cache correctness. Just set it 6904 * to WB to keep consistent with host. So the same as item 3. 6905 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep 6906 * consistent with host MTRR 6907 */ 6908 if (is_mmio) { 6909 cache = MTRR_TYPE_UNCACHABLE; 6910 goto exit; 6911 } 6912 6913 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { 6914 ipat = VMX_EPT_IPAT_BIT; 6915 cache = MTRR_TYPE_WRBACK; 6916 goto exit; 6917 } 6918 6919 if (kvm_read_cr0(vcpu) & X86_CR0_CD) { 6920 ipat = VMX_EPT_IPAT_BIT; 6921 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) 6922 cache = MTRR_TYPE_WRBACK; 6923 else 6924 cache = MTRR_TYPE_UNCACHABLE; 6925 goto exit; 6926 } 6927 6928 cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); 6929 6930 exit: 6931 return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; 6932 } 6933 6934 static int vmx_get_lpage_level(void) 6935 { 6936 if (enable_ept && !cpu_has_vmx_ept_1g_page()) 6937 return PT_DIRECTORY_LEVEL; 6938 else 6939 /* For shadow and EPT supported 1GB page */ 6940 return PT_PDPE_LEVEL; 6941 } 6942 6943 static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) 6944 { 6945 /* 6946 * These bits in the secondary execution controls field 6947 * are dynamic, the others are mostly based on the hypervisor 6948 * architecture and the guest's CPUID. Do not touch the 6949 * dynamic bits. 
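	 * The mask below therefore keeps the current values of the dynamic
	 * bits and only lets the remaining, CPUID-derived bits be updated.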
6950 */ 6951 u32 mask = 6952 SECONDARY_EXEC_SHADOW_VMCS | 6953 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 6954 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 6955 SECONDARY_EXEC_DESC; 6956 6957 u32 new_ctl = vmx->secondary_exec_control; 6958 u32 cur_ctl = secondary_exec_controls_get(vmx); 6959 6960 secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); 6961 } 6962 6963 /* 6964 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits 6965 * (indicating "allowed-1") if they are supported in the guest's CPUID. 6966 */ 6967 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) 6968 { 6969 struct vcpu_vmx *vmx = to_vmx(vcpu); 6970 struct kvm_cpuid_entry2 *entry; 6971 6972 vmx->nested.msrs.cr0_fixed1 = 0xffffffff; 6973 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; 6974 6975 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ 6976 if (entry && (entry->_reg & (_cpuid_mask))) \ 6977 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ 6978 } while (0) 6979 6980 entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); 6981 cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); 6982 cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); 6983 cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); 6984 cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); 6985 cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); 6986 cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); 6987 cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); 6988 cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); 6989 cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); 6990 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); 6991 cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); 6992 cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); 6993 cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); 6994 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); 6995 6996 entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); 6997 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); 6998 cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); 6999 cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); 7000 cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); 7001 cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); 7002 7003 #undef cr4_fixed1_update 7004 } 7005 7006 static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) 7007 { 7008 struct vcpu_vmx *vmx = to_vmx(vcpu); 7009 7010 if (kvm_mpx_supported()) { 7011 bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); 7012 7013 if (mpx_enabled) { 7014 vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; 7015 vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 7016 } else { 7017 vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; 7018 vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; 7019 } 7020 } 7021 } 7022 7023 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) 7024 { 7025 struct vcpu_vmx *vmx = to_vmx(vcpu); 7026 struct kvm_cpuid_entry2 *best = NULL; 7027 int i; 7028 7029 for (i = 0; i < PT_CPUID_LEAVES; i++) { 7030 best = kvm_find_cpuid_entry(vcpu, 0x14, i); 7031 if (!best) 7032 return; 7033 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; 7034 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; 7035 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; 7036 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; 7037 } 7038 7039 /* Get the 
number of configurable Address Ranges for filtering */
	vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps,
						PT_CAP_num_address_ranges);

	/* Initialize and clear the no dependency bits */
	vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS |
			RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1, CR3Filter can be set; otherwise
	 * a #GP will be injected.
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1, CYCEn, CycThresh and
	 * PSBFreq can be set.
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC |
				RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ);

	/*
	 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1, MTCEn, BranchEn and
	 * MTCFreq can be set.
	 */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN |
				RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE);

	/* If CPUID.(EAX=14H,ECX=0):EBX[4]=1, FUPonPTW and PTWEn can be set. */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite))
		vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW |
						RTIT_CTL_PTW_EN);

	/* If CPUID.(EAX=14H,ECX=0):EBX[5]=1, PwrEvEn can be set. */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;

	/* If CPUID.(EAX=14H,ECX=0):ECX[0]=1, ToPA can be set. */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;

	/* If CPUID.(EAX=14H,ECX=0):ECX[3]=1, FabricEn can be set. */
	if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
		vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;

	/* Unmask the address range configuration area. */
	for (i = 0; i < vmx->pt_desc.addr_range; i++)
		vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}

static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (cpu_has_secondary_exec_ctrls()) {
		vmx_compute_secondary_exec_control(vmx);
		vmcs_set_secondary_exec_control(vmx);
	}

	if (nested_vmx_allowed(vcpu))
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
			FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
	else
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
			~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

	if (nested_vmx_allowed(vcpu)) {
		nested_vmx_cr_fixed1_bits_update(vcpu);
		nested_vmx_entry_exit_ctls_update(vcpu);
	}

	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
			guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
		update_intel_pt_cfg(vcpu);
}

static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
	if (func == 1 && nested)
		entry->ecx |= bit(X86_FEATURE_VMX);
}

static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
{
	to_vmx(vcpu)->req_immediate_exit = true;
}

static int vmx_check_intercept(struct kvm_vcpu *vcpu,
			       struct x86_instruction_info *info,
			       enum x86_intercept_stage stage)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;

	/*
	 * RDPID causes #UD if disabled through secondary execution
controls. 7138 * Because it is marked as EmulateOnUD, we need to intercept it here. 7139 */ 7140 if (info->intercept == x86_intercept_rdtscp && 7141 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { 7142 ctxt->exception.vector = UD_VECTOR; 7143 ctxt->exception.error_code_valid = false; 7144 return X86EMUL_PROPAGATE_FAULT; 7145 } 7146 7147 /* TODO: check more intercepts... */ 7148 return X86EMUL_CONTINUE; 7149 } 7150 7151 #ifdef CONFIG_X86_64 7152 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ 7153 static inline int u64_shl_div_u64(u64 a, unsigned int shift, 7154 u64 divisor, u64 *result) 7155 { 7156 u64 low = a << shift, high = a >> (64 - shift); 7157 7158 /* To avoid the overflow on divq */ 7159 if (high >= divisor) 7160 return 1; 7161 7162 /* Low hold the result, high hold rem which is discarded */ 7163 asm("divq %2\n\t" : "=a" (low), "=d" (high) : 7164 "rm" (divisor), "0" (low), "1" (high)); 7165 *result = low; 7166 7167 return 0; 7168 } 7169 7170 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, 7171 bool *expired) 7172 { 7173 struct vcpu_vmx *vmx; 7174 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; 7175 struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; 7176 7177 if (kvm_mwait_in_guest(vcpu->kvm) || 7178 kvm_can_post_timer_interrupt(vcpu)) 7179 return -EOPNOTSUPP; 7180 7181 vmx = to_vmx(vcpu); 7182 tscl = rdtsc(); 7183 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); 7184 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; 7185 lapic_timer_advance_cycles = nsec_to_cycles(vcpu, 7186 ktimer->timer_advance_ns); 7187 7188 if (delta_tsc > lapic_timer_advance_cycles) 7189 delta_tsc -= lapic_timer_advance_cycles; 7190 else 7191 delta_tsc = 0; 7192 7193 /* Convert to host delta tsc if tsc scaling is enabled */ 7194 if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && 7195 delta_tsc && u64_shl_div_u64(delta_tsc, 7196 kvm_tsc_scaling_ratio_frac_bits, 7197 vcpu->arch.tsc_scaling_ratio, &delta_tsc)) 7198 return -ERANGE; 7199 7200 /* 7201 * If the delta tsc can't fit in the 32 bit after the multi shift, 7202 * we can't use the preemption timer. 7203 * It's possible that it fits on later vmentries, but checking 7204 * on every vmentry is costly so we just use an hrtimer. 7205 */ 7206 if (delta_tsc >> (cpu_preemption_timer_multi + 32)) 7207 return -ERANGE; 7208 7209 vmx->hv_deadline_tsc = tscl + delta_tsc; 7210 *expired = !delta_tsc; 7211 return 0; 7212 } 7213 7214 static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) 7215 { 7216 to_vmx(vcpu)->hv_deadline_tsc = -1; 7217 } 7218 #endif 7219 7220 static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) 7221 { 7222 if (!kvm_pause_in_guest(vcpu->kvm)) 7223 shrink_ple_window(vcpu); 7224 } 7225 7226 static void vmx_slot_enable_log_dirty(struct kvm *kvm, 7227 struct kvm_memory_slot *slot) 7228 { 7229 kvm_mmu_slot_leaf_clear_dirty(kvm, slot); 7230 kvm_mmu_slot_largepage_remove_write_access(kvm, slot); 7231 } 7232 7233 static void vmx_slot_disable_log_dirty(struct kvm *kvm, 7234 struct kvm_memory_slot *slot) 7235 { 7236 kvm_mmu_slot_set_dirty(kvm, slot); 7237 } 7238 7239 static void vmx_flush_log_dirty(struct kvm *kvm) 7240 { 7241 kvm_flush_pml_buffers(kvm); 7242 } 7243 7244 static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu) 7245 { 7246 struct vmcs12 *vmcs12; 7247 struct vcpu_vmx *vmx = to_vmx(vcpu); 7248 gpa_t gpa, dst; 7249 7250 if (is_guest_mode(vcpu)) { 7251 WARN_ON_ONCE(vmx->nested.pml_full); 7252 7253 /* 7254 * Check if PML is enabled for the nested guest. 
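		 * PML for L2 is emulated in software: the logged GPA is
		 * written into the vmcs12 PML buffer supplied by L1 instead
		 * of a hardware buffer.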
7255 * Whether eptp bit 6 is set is already checked 7256 * as part of A/D emulation. 7257 */ 7258 vmcs12 = get_vmcs12(vcpu); 7259 if (!nested_cpu_has_pml(vmcs12)) 7260 return 0; 7261 7262 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 7263 vmx->nested.pml_full = true; 7264 return 1; 7265 } 7266 7267 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull; 7268 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 7269 7270 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 7271 offset_in_page(dst), sizeof(gpa))) 7272 return 0; 7273 7274 vmcs12->guest_pml_index--; 7275 } 7276 7277 return 0; 7278 } 7279 7280 static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, 7281 struct kvm_memory_slot *memslot, 7282 gfn_t offset, unsigned long mask) 7283 { 7284 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); 7285 } 7286 7287 static void __pi_post_block(struct kvm_vcpu *vcpu) 7288 { 7289 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 7290 struct pi_desc old, new; 7291 unsigned int dest; 7292 7293 do { 7294 old.control = new.control = pi_desc->control; 7295 WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR, 7296 "Wakeup handler not enabled while the VCPU is blocked\n"); 7297 7298 dest = cpu_physical_id(vcpu->cpu); 7299 7300 if (x2apic_enabled()) 7301 new.ndst = dest; 7302 else 7303 new.ndst = (dest << 8) & 0xFF00; 7304 7305 /* set 'NV' to 'notification vector' */ 7306 new.nv = POSTED_INTR_VECTOR; 7307 } while (cmpxchg64(&pi_desc->control, old.control, 7308 new.control) != old.control); 7309 7310 if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) { 7311 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); 7312 list_del(&vcpu->blocked_vcpu_list); 7313 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); 7314 vcpu->pre_pcpu = -1; 7315 } 7316 } 7317 7318 /* 7319 * This routine does the following things for vCPU which is going 7320 * to be blocked if VT-d PI is enabled. 7321 * - Store the vCPU to the wakeup list, so when interrupts happen 7322 * we can find the right vCPU to wake up. 7323 * - Change the Posted-interrupt descriptor as below: 7324 * 'NDST' <-- vcpu->pre_pcpu 7325 * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR 7326 * - If 'ON' is set during this process, which means at least one 7327 * interrupt is posted for this vCPU, we cannot block it, in 7328 * this case, return 1, otherwise, return 0. 
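 * A non-zero return propagates out of vmx_pre_block(), in which case the
 * vCPU is not put to sleep and returns to the run loop instead.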
7329 * 7330 */ 7331 static int pi_pre_block(struct kvm_vcpu *vcpu) 7332 { 7333 unsigned int dest; 7334 struct pi_desc old, new; 7335 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 7336 7337 if (!kvm_arch_has_assigned_device(vcpu->kvm) || 7338 !irq_remapping_cap(IRQ_POSTING_CAP) || 7339 !kvm_vcpu_apicv_active(vcpu)) 7340 return 0; 7341 7342 WARN_ON(irqs_disabled()); 7343 local_irq_disable(); 7344 if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) { 7345 vcpu->pre_pcpu = vcpu->cpu; 7346 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); 7347 list_add_tail(&vcpu->blocked_vcpu_list, 7348 &per_cpu(blocked_vcpu_on_cpu, 7349 vcpu->pre_pcpu)); 7350 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu)); 7351 } 7352 7353 do { 7354 old.control = new.control = pi_desc->control; 7355 7356 WARN((pi_desc->sn == 1), 7357 "Warning: SN field of posted-interrupts " 7358 "is set before blocking\n"); 7359 7360 /* 7361 * Since vCPU can be preempted during this process, 7362 * vcpu->cpu could be different with pre_pcpu, we 7363 * need to set pre_pcpu as the destination of wakeup 7364 * notification event, then we can find the right vCPU 7365 * to wakeup in wakeup handler if interrupts happen 7366 * when the vCPU is in blocked state. 7367 */ 7368 dest = cpu_physical_id(vcpu->pre_pcpu); 7369 7370 if (x2apic_enabled()) 7371 new.ndst = dest; 7372 else 7373 new.ndst = (dest << 8) & 0xFF00; 7374 7375 /* set 'NV' to 'wakeup vector' */ 7376 new.nv = POSTED_INTR_WAKEUP_VECTOR; 7377 } while (cmpxchg64(&pi_desc->control, old.control, 7378 new.control) != old.control); 7379 7380 /* We should not block the vCPU if an interrupt is posted for it. */ 7381 if (pi_test_on(pi_desc) == 1) 7382 __pi_post_block(vcpu); 7383 7384 local_irq_enable(); 7385 return (vcpu->pre_pcpu == -1); 7386 } 7387 7388 static int vmx_pre_block(struct kvm_vcpu *vcpu) 7389 { 7390 if (pi_pre_block(vcpu)) 7391 return 1; 7392 7393 if (kvm_lapic_hv_timer_in_use(vcpu)) 7394 kvm_lapic_switch_to_sw_timer(vcpu); 7395 7396 return 0; 7397 } 7398 7399 static void pi_post_block(struct kvm_vcpu *vcpu) 7400 { 7401 if (vcpu->pre_pcpu == -1) 7402 return; 7403 7404 WARN_ON(irqs_disabled()); 7405 local_irq_disable(); 7406 __pi_post_block(vcpu); 7407 local_irq_enable(); 7408 } 7409 7410 static void vmx_post_block(struct kvm_vcpu *vcpu) 7411 { 7412 if (kvm_x86_ops->set_hv_timer) 7413 kvm_lapic_switch_to_hv_timer(vcpu); 7414 7415 pi_post_block(vcpu); 7416 } 7417 7418 /* 7419 * vmx_update_pi_irte - set IRTE for Posted-Interrupts 7420 * 7421 * @kvm: kvm 7422 * @host_irq: host irq of the interrupt 7423 * @guest_irq: gsi of the interrupt 7424 * @set: set or unset PI 7425 * returns 0 on success, < 0 on failure 7426 */ 7427 static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, 7428 uint32_t guest_irq, bool set) 7429 { 7430 struct kvm_kernel_irq_routing_entry *e; 7431 struct kvm_irq_routing_table *irq_rt; 7432 struct kvm_lapic_irq irq; 7433 struct kvm_vcpu *vcpu; 7434 struct vcpu_data vcpu_info; 7435 int idx, ret = 0; 7436 7437 if (!kvm_arch_has_assigned_device(kvm) || 7438 !irq_remapping_cap(IRQ_POSTING_CAP) || 7439 !kvm_vcpu_apicv_active(kvm->vcpus[0])) 7440 return 0; 7441 7442 idx = srcu_read_lock(&kvm->irq_srcu); 7443 irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 7444 if (guest_irq >= irq_rt->nr_rt_entries || 7445 hlist_empty(&irq_rt->map[guest_irq])) { 7446 pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", 7447 guest_irq, irq_rt->nr_rt_entries); 7448 goto out; 7449 } 7450 7451 hlist_for_each_entry(e, 
&irq_rt->map[guest_irq], link) { 7452 if (e->type != KVM_IRQ_ROUTING_MSI) 7453 continue; 7454 /* 7455 * VT-d PI cannot support posting multicast/broadcast 7456 * interrupts to a vCPU, we still use interrupt remapping 7457 * for these kind of interrupts. 7458 * 7459 * For lowest-priority interrupts, we only support 7460 * those with single CPU as the destination, e.g. user 7461 * configures the interrupts via /proc/irq or uses 7462 * irqbalance to make the interrupts single-CPU. 7463 * 7464 * We will support full lowest-priority interrupt later. 7465 * 7466 * In addition, we can only inject generic interrupts using 7467 * the PI mechanism, refuse to route others through it. 7468 */ 7469 7470 kvm_set_msi_irq(kvm, e, &irq); 7471 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || 7472 !kvm_irq_is_postable(&irq)) { 7473 /* 7474 * Make sure the IRTE is in remapped mode if 7475 * we don't handle it in posted mode. 7476 */ 7477 ret = irq_set_vcpu_affinity(host_irq, NULL); 7478 if (ret < 0) { 7479 printk(KERN_INFO 7480 "failed to back to remapped mode, irq: %u\n", 7481 host_irq); 7482 goto out; 7483 } 7484 7485 continue; 7486 } 7487 7488 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); 7489 vcpu_info.vector = irq.vector; 7490 7491 trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, 7492 vcpu_info.vector, vcpu_info.pi_desc_addr, set); 7493 7494 if (set) 7495 ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); 7496 else 7497 ret = irq_set_vcpu_affinity(host_irq, NULL); 7498 7499 if (ret < 0) { 7500 printk(KERN_INFO "%s: failed to update PI IRTE\n", 7501 __func__); 7502 goto out; 7503 } 7504 } 7505 7506 ret = 0; 7507 out: 7508 srcu_read_unlock(&kvm->irq_srcu, idx); 7509 return ret; 7510 } 7511 7512 static void vmx_setup_mce(struct kvm_vcpu *vcpu) 7513 { 7514 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 7515 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 7516 FEATURE_CONTROL_LMCE; 7517 else 7518 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 7519 ~FEATURE_CONTROL_LMCE; 7520 } 7521 7522 static int vmx_smi_allowed(struct kvm_vcpu *vcpu) 7523 { 7524 /* we need a nested vmexit to enter SMM, postpone if run is pending */ 7525 if (to_vmx(vcpu)->nested.nested_run_pending) 7526 return 0; 7527 return 1; 7528 } 7529 7530 static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) 7531 { 7532 struct vcpu_vmx *vmx = to_vmx(vcpu); 7533 7534 vmx->nested.smm.guest_mode = is_guest_mode(vcpu); 7535 if (vmx->nested.smm.guest_mode) 7536 nested_vmx_vmexit(vcpu, -1, 0, 0); 7537 7538 vmx->nested.smm.vmxon = vmx->nested.vmxon; 7539 vmx->nested.vmxon = false; 7540 vmx_clear_hlt(vcpu); 7541 return 0; 7542 } 7543 7544 static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) 7545 { 7546 struct vcpu_vmx *vmx = to_vmx(vcpu); 7547 int ret; 7548 7549 if (vmx->nested.smm.vmxon) { 7550 vmx->nested.vmxon = true; 7551 vmx->nested.smm.vmxon = false; 7552 } 7553 7554 if (vmx->nested.smm.guest_mode) { 7555 ret = nested_vmx_enter_non_root_mode(vcpu, false); 7556 if (ret) 7557 return ret; 7558 7559 vmx->nested.smm.guest_mode = false; 7560 } 7561 return 0; 7562 } 7563 7564 static int enable_smi_window(struct kvm_vcpu *vcpu) 7565 { 7566 return 0; 7567 } 7568 7569 static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) 7570 { 7571 return false; 7572 } 7573 7574 static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 7575 { 7576 return to_vmx(vcpu)->nested.vmxon; 7577 } 7578 7579 static __init int hardware_setup(void) 7580 { 7581 unsigned long host_bndcfgs; 7582 struct desc_ptr dt; 7583 int 

static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
			FEATURE_CONTROL_LMCE;
	else
		to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
			~FEATURE_CONTROL_LMCE;
}

static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
{
	/* We need a nested vmexit to enter SMM; postpone if a run is pending. */
	if (to_vmx(vcpu)->nested.nested_run_pending)
		return 0;
	return 1;
}

static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
	if (vmx->nested.smm.guest_mode)
		nested_vmx_vmexit(vcpu, -1, 0, 0);

	vmx->nested.smm.vmxon = vmx->nested.vmxon;
	vmx->nested.vmxon = false;
	vmx_clear_hlt(vcpu);
	return 0;
}

static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int ret;

	if (vmx->nested.smm.vmxon) {
		vmx->nested.vmxon = true;
		vmx->nested.smm.vmxon = false;
	}

	if (vmx->nested.smm.guest_mode) {
		ret = nested_vmx_enter_non_root_mode(vcpu, false);
		if (ret)
			return ret;

		vmx->nested.smm.guest_mode = false;
	}
	return 0;
}

static int enable_smi_window(struct kvm_vcpu *vcpu)
{
	return 0;
}

static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
{
	return false;
}

static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.vmxon;
}

static __init int hardware_setup(void)
{
	unsigned long host_bndcfgs;
	struct desc_ptr dt;
	int r, i;

	rdmsrl_safe(MSR_EFER, &host_efer);

	store_idt(&dt);
	host_idt_base = dt.address;

	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
		kvm_define_shared_msr(i, vmx_msr_index[i]);

	if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
		return -EIO;

	if (boot_cpu_has(X86_FEATURE_NX))
		kvm_enable_efer_bits(EFER_NX);

	if (boot_cpu_has(X86_FEATURE_MPX)) {
		rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
		WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
	}

	if (boot_cpu_has(X86_FEATURE_XSAVES))
		rdmsrl(MSR_IA32_XSS, host_xss);

	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
	    !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
		enable_vpid = 0;

	if (!cpu_has_vmx_ept() ||
	    !cpu_has_vmx_ept_4levels() ||
	    !cpu_has_vmx_ept_mt_wb() ||
	    !cpu_has_vmx_invept_global())
		enable_ept = 0;

	if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
		enable_ept_ad_bits = 0;

	if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
		enable_unrestricted_guest = 0;

	if (!cpu_has_vmx_flexpriority())
		flexpriority_enabled = 0;

	if (!cpu_has_virtual_nmis())
		enable_vnmi = 0;

	/*
	 * set_apic_access_page_addr() is used to reload the APIC access
	 * page upon invalidation.  No need to do anything if the
	 * APIC_ACCESS_ADDR VMCS field is not used.
	 */
	if (!flexpriority_enabled)
		kvm_x86_ops->set_apic_access_page_addr = NULL;

	if (!cpu_has_vmx_tpr_shadow())
		kvm_x86_ops->update_cr8_intercept = NULL;

	if (enable_ept && !cpu_has_vmx_ept_2m_page())
		kvm_disable_largepages();

#if IS_ENABLED(CONFIG_HYPERV)
	if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
	    && enable_ept) {
		kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb;
		kvm_x86_ops->tlb_remote_flush_with_range =
			hv_remote_flush_tlb_with_range;
	}
#endif

	if (!cpu_has_vmx_ple()) {
		ple_gap = 0;
		ple_window = 0;
		ple_window_grow = 0;
		ple_window_max = 0;
		ple_window_shrink = 0;
	}

	if (!cpu_has_vmx_apicv()) {
		enable_apicv = 0;
		kvm_x86_ops->sync_pir_to_irr = NULL;
	}

	if (cpu_has_vmx_tsc_scaling()) {
		kvm_has_tsc_control = true;
		kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
		kvm_tsc_scaling_ratio_frac_bits = 48;
	}

	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */

	if (enable_ept)
		vmx_enable_tdp();
	else
		kvm_disable_tdp();

	/*
	 * Only enable PML when the hardware supports it and both the EPT
	 * and EPT A/D bit features are enabled; PML depends on them to work.
	 */
	if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
		enable_pml = 0;

	if (!enable_pml) {
		kvm_x86_ops->slot_enable_log_dirty = NULL;
		kvm_x86_ops->slot_disable_log_dirty = NULL;
		kvm_x86_ops->flush_log_dirty = NULL;
		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
	}

	if (!cpu_has_vmx_preemption_timer())
		enable_preemption_timer = false;

	if (enable_preemption_timer) {
		u64 use_timer_freq = 5000ULL * 1000 * 1000;
		u64 vmx_msr;

		rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
		cpu_preemption_timer_multi =
			vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;

		if (tsc_khz)
			use_timer_freq = (u64)tsc_khz * 1000;
		use_timer_freq >>= cpu_preemption_timer_multi;

		/*
		 * KVM "disables" the preemption timer by setting it to its max
		 * value.  Don't use the timer if it might cause spurious exits
		 * at a rate faster than 0.1 Hz (of uninterrupted guest time).
		 */
		if (use_timer_freq > 0xffffffffu / 10)
			enable_preemption_timer = false;
	}
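
	/*
	 * Worked example with illustrative numbers (not from the original
	 * source): with a 2.1 GHz TSC (tsc_khz = 2100000) and an
	 * IA32_VMX_MISC rate field of 5, the timer ticks at
	 * 2.1e9 >> 5 ~= 65.6 MHz, so counting down from the maximum value
	 * 0xffffffff takes roughly 65 seconds.  That is comfortably above
	 * the 10 second (0.1 Hz) limit enforced above, which corresponds to
	 * a maximum usable tick rate of 0xffffffff / 10 ~= 429.5 MHz, so
	 * the preemption timer would stay enabled on such a host.
	 */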

	if (!enable_preemption_timer) {
		kvm_x86_ops->set_hv_timer = NULL;
		kvm_x86_ops->cancel_hv_timer = NULL;
		kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
	}

	kvm_set_posted_intr_wakeup_handler(wakeup_handler);

	kvm_mce_cap_supported |= MCG_LMCE_P;

	if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
		return -EINVAL;
	if (!enable_ept || !cpu_has_vmx_intel_pt())
		pt_mode = PT_MODE_SYSTEM;

	if (nested) {
		nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
					   vmx_capability.ept, enable_apicv);

		r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
		if (r)
			return r;
	}

	r = alloc_kvm_area();
	if (r)
		nested_vmx_hardware_unsetup();
	return r;
}

static __exit void hardware_unsetup(void)
{
	if (nested)
		nested_vmx_hardware_unsetup();

	free_kvm_area();
}
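
/*
 * Added note: this is the table of VMX callbacks handed to the arch-neutral
 * KVM code via kvm_init() below.  Entries the hardware cannot support are
 * cleared back to NULL in hardware_setup() above, and the
 * nested-virtualization hooks start out NULL here; they are presumably
 * installed by nested_vmx_hardware_setup() when nested is enabled.
 */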

static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
	.cpu_has_kvm_support = cpu_has_kvm_support,
	.disabled_by_bios = vmx_disabled_by_bios,
	.hardware_setup = hardware_setup,
	.hardware_unsetup = hardware_unsetup,
	.check_processor_compatibility = vmx_check_processor_compat,
	.hardware_enable = hardware_enable,
	.hardware_disable = hardware_disable,
	.cpu_has_accelerated_tpr = report_flexpriority,
	.has_emulated_msr = vmx_has_emulated_msr,

	.vm_init = vmx_vm_init,
	.vm_alloc = vmx_vm_alloc,
	.vm_free = vmx_vm_free,

	.vcpu_create = vmx_create_vcpu,
	.vcpu_free = vmx_free_vcpu,
	.vcpu_reset = vmx_vcpu_reset,

	.prepare_guest_switch = vmx_prepare_switch_to_guest,
	.vcpu_load = vmx_vcpu_load,
	.vcpu_put = vmx_vcpu_put,

	.update_bp_intercept = update_exception_bitmap,
	.get_msr_feature = vmx_get_msr_feature,
	.get_msr = vmx_get_msr,
	.set_msr = vmx_set_msr,
	.get_segment_base = vmx_get_segment_base,
	.get_segment = vmx_get_segment,
	.set_segment = vmx_set_segment,
	.get_cpl = vmx_get_cpl,
	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
	.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
	.decache_cr3 = vmx_decache_cr3,
	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
	.set_cr0 = vmx_set_cr0,
	.set_cr3 = vmx_set_cr3,
	.set_cr4 = vmx_set_cr4,
	.set_efer = vmx_set_efer,
	.get_idt = vmx_get_idt,
	.set_idt = vmx_set_idt,
	.get_gdt = vmx_get_gdt,
	.set_gdt = vmx_set_gdt,
	.get_dr6 = vmx_get_dr6,
	.set_dr6 = vmx_set_dr6,
	.set_dr7 = vmx_set_dr7,
	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
	.cache_reg = vmx_cache_reg,
	.get_rflags = vmx_get_rflags,
	.set_rflags = vmx_set_rflags,

	.tlb_flush = vmx_flush_tlb,
	.tlb_flush_gva = vmx_flush_tlb_gva,

	.run = vmx_vcpu_run,
	.handle_exit = vmx_handle_exit,
	.skip_emulated_instruction = skip_emulated_instruction,
	.set_interrupt_shadow = vmx_set_interrupt_shadow,
	.get_interrupt_shadow = vmx_get_interrupt_shadow,
	.patch_hypercall = vmx_patch_hypercall,
	.set_irq = vmx_inject_irq,
	.set_nmi = vmx_inject_nmi,
	.queue_exception = vmx_queue_exception,
	.cancel_injection = vmx_cancel_injection,
	.interrupt_allowed = vmx_interrupt_allowed,
	.nmi_allowed = vmx_nmi_allowed,
	.get_nmi_mask = vmx_get_nmi_mask,
	.set_nmi_mask = vmx_set_nmi_mask,
	.enable_nmi_window = enable_nmi_window,
	.enable_irq_window = enable_irq_window,
	.update_cr8_intercept = update_cr8_intercept,
	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
	.get_enable_apicv = vmx_get_enable_apicv,
	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
	.load_eoi_exitmap = vmx_load_eoi_exitmap,
	.apicv_post_state_restore = vmx_apicv_post_state_restore,
	.hwapic_irr_update = vmx_hwapic_irr_update,
	.hwapic_isr_update = vmx_hwapic_isr_update,
	.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
	.sync_pir_to_irr = vmx_sync_pir_to_irr,
	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
	.dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,

	.set_tss_addr = vmx_set_tss_addr,
	.set_identity_map_addr = vmx_set_identity_map_addr,
	.get_tdp_level = get_ept_level,
	.get_mt_mask = vmx_get_mt_mask,

	.get_exit_info = vmx_get_exit_info,

	.get_lpage_level = vmx_get_lpage_level,

	.cpuid_update = vmx_cpuid_update,

	.rdtscp_supported = vmx_rdtscp_supported,
	.invpcid_supported = vmx_invpcid_supported,

	.set_supported_cpuid = vmx_set_supported_cpuid,

	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,

	.read_l1_tsc_offset = vmx_read_l1_tsc_offset,
	.write_l1_tsc_offset = vmx_write_l1_tsc_offset,

	.set_tdp_cr3 = vmx_set_cr3,

	.check_intercept = vmx_check_intercept,
	.handle_exit_irqoff = vmx_handle_exit_irqoff,
	.mpx_supported = vmx_mpx_supported,
	.xsaves_supported = vmx_xsaves_supported,
	.umip_emulated = vmx_umip_emulated,
	.pt_supported = vmx_pt_supported,

	.request_immediate_exit = vmx_request_immediate_exit,

	.sched_in = vmx_sched_in,

	.slot_enable_log_dirty = vmx_slot_enable_log_dirty,
	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
	.flush_log_dirty = vmx_flush_log_dirty,
	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
	.write_log_dirty = vmx_write_pml_buffer,

	.pre_block = vmx_pre_block,
	.post_block = vmx_post_block,

	.pmu_ops = &intel_pmu_ops,

	.update_pi_irte = vmx_update_pi_irte,

#ifdef CONFIG_X86_64
	.set_hv_timer = vmx_set_hv_timer,
	.cancel_hv_timer = vmx_cancel_hv_timer,
#endif

	.setup_mce = vmx_setup_mce,

	.smi_allowed = vmx_smi_allowed,
	.pre_enter_smm = vmx_pre_enter_smm,
	.pre_leave_smm = vmx_pre_leave_smm,
	.enable_smi_window = enable_smi_window,

	.check_nested_events = NULL,
	.get_nested_state = NULL,
	.set_nested_state = NULL,
	.get_vmcs12_pages = NULL,
	.nested_enable_evmcs = NULL,
	.nested_get_evmcs_version = NULL,
	.need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
};
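
/*
 * Minimal illustrative sketch (hypothetical helper, not part of KVM, kept
 * compiled out on purpose): arch-neutral code reaches the callbacks above
 * through the kvm_x86_ops pointer, and a hook that hardware_setup() cleared
 * to NULL is generally treated as "feature not available".
 */
#if 0
static bool example_vmx_hv_timer_usable(void)
{
	/*
	 * hardware_setup() NULLs set_hv_timer/cancel_hv_timer when the VMX
	 * preemption timer cannot be used, so a NULL check doubles as a
	 * feature check.
	 */
	return kvm_x86_ops->set_hv_timer != NULL;
}
#endif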

static void vmx_cleanup_l1d_flush(void)
{
	if (vmx_l1d_flush_pages) {
		free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
		vmx_l1d_flush_pages = NULL;
	}
	/* Restore state so sysfs ignores VMX */
	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}

static void vmx_exit(void)
{
#ifdef CONFIG_KEXEC_CORE
	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
	synchronize_rcu();
#endif

	kvm_exit();

#if IS_ENABLED(CONFIG_HYPERV)
	if (static_branch_unlikely(&enable_evmcs)) {
		int cpu;
		struct hv_vp_assist_page *vp_ap;
		/*
		 * Reset everything to support using non-enlightened VMCS
		 * access later (e.g. when we reload the module with
		 * enlightened_vmcs=0).
		 */
		for_each_online_cpu(cpu) {
			vp_ap = hv_get_vp_assist_page(cpu);

			if (!vp_ap)
				continue;

			vp_ap->nested_control.features.directhypercall = 0;
			vp_ap->current_nested_vmcs = 0;
			vp_ap->enlighten_vmentry = 0;
		}

		static_branch_disable(&enable_evmcs);
	}
#endif
	vmx_cleanup_l1d_flush();
}
module_exit(vmx_exit);

static int __init vmx_init(void)
{
	int r;

#if IS_ENABLED(CONFIG_HYPERV)
	/*
	 * Enlightened VMCS is only used when Hyper-V recommends it and the
	 * host supports eVMCS v1 or above.  eVMCS support can also be
	 * disabled with the module parameter.
	 */
	if (enlightened_vmcs &&
	    ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
	    (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
	    KVM_EVMCS_VERSION) {
		int cpu;

		/* Check that we have assist pages on all online CPUs */
		for_each_online_cpu(cpu) {
			if (!hv_get_vp_assist_page(cpu)) {
				enlightened_vmcs = false;
				break;
			}
		}

		if (enlightened_vmcs) {
			pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
			static_branch_enable(&enable_evmcs);
		}

		if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
			vmx_x86_ops.enable_direct_tlbflush
				= hv_enable_direct_tlbflush;

	} else {
		enlightened_vmcs = false;
	}
#endif

	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
		     __alignof__(struct vcpu_vmx), THIS_MODULE);
	if (r)
		return r;

	/*
	 * Must be called after kvm_init() so enable_ept is properly set up.
	 * Hand in the mitigation parameter value that was stored by the
	 * pre-module-init parser.  If no parameter was given, it contains
	 * 'auto', which is turned into the default 'cond' mitigation mode.
	 */
	r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
	if (r) {
		vmx_exit();
		return r;
	}

#ifdef CONFIG_KEXEC_CORE
	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
			   crash_vmclear_local_loaded_vmcss);
#endif
	vmx_check_vmcs12_offsets();

	return 0;
}
module_init(vmx_init);
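
/*
 * Usage sketch (illustrative only, assuming the usual kvm-intel module
 * name; not part of the original source):
 *
 *   modprobe kvm-intel                     # vmx_init(), eVMCS auto-detected
 *   rmmod kvm-intel                        # vmx_exit() resets VP assist pages
 *   modprobe kvm-intel enlightened_vmcs=0  # reload with eVMCS disabled
 *
 * Note that vmx_exit() is also called from the error path of vmx_init()
 * above, so the teardown has to tolerate running right after kvm_init()
 * without the rest of the setup having completed.
 */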