1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * 10 * Authors: 11 * Avi Kivity <avi@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com> 13 * 14 * This work is licensed under the terms of the GNU GPL, version 2. See 15 * the COPYING file in the top-level directory. 16 * 17 */ 18 19 #include <linux/frame.h> 20 #include <linux/highmem.h> 21 #include <linux/hrtimer.h> 22 #include <linux/kernel.h> 23 #include <linux/kvm_host.h> 24 #include <linux/module.h> 25 #include <linux/moduleparam.h> 26 #include <linux/mod_devicetable.h> 27 #include <linux/mm.h> 28 #include <linux/sched.h> 29 #include <linux/sched/smt.h> 30 #include <linux/slab.h> 31 #include <linux/tboot.h> 32 #include <linux/trace_events.h> 33 34 #include <asm/apic.h> 35 #include <asm/asm.h> 36 #include <asm/cpu.h> 37 #include <asm/debugreg.h> 38 #include <asm/desc.h> 39 #include <asm/fpu/internal.h> 40 #include <asm/io.h> 41 #include <asm/irq_remapping.h> 42 #include <asm/kexec.h> 43 #include <asm/perf_event.h> 44 #include <asm/mce.h> 45 #include <asm/mmu_context.h> 46 #include <asm/mshyperv.h> 47 #include <asm/spec-ctrl.h> 48 #include <asm/virtext.h> 49 #include <asm/vmx.h> 50 51 #include "capabilities.h" 52 #include "cpuid.h" 53 #include "evmcs.h" 54 #include "irq.h" 55 #include "kvm_cache_regs.h" 56 #include "lapic.h" 57 #include "mmu.h" 58 #include "nested.h" 59 #include "ops.h" 60 #include "pmu.h" 61 #include "trace.h" 62 #include "vmcs.h" 63 #include "vmcs12.h" 64 #include "vmx.h" 65 #include "x86.h" 66 67 MODULE_AUTHOR("Qumranet"); 68 MODULE_LICENSE("GPL"); 69 70 static const struct x86_cpu_id vmx_cpu_id[] = { 71 X86_FEATURE_MATCH(X86_FEATURE_VMX), 72 {} 73 }; 74 MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id); 75 76 bool __read_mostly enable_vpid = 1; 77 
module_param_named(vpid, enable_vpid, bool, 0444); 78 79 static bool __read_mostly enable_vnmi = 1; 80 module_param_named(vnmi, enable_vnmi, bool, S_IRUGO); 81 82 bool __read_mostly flexpriority_enabled = 1; 83 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); 84 85 bool __read_mostly enable_ept = 1; 86 module_param_named(ept, enable_ept, bool, S_IRUGO); 87 88 bool __read_mostly enable_unrestricted_guest = 1; 89 module_param_named(unrestricted_guest, 90 enable_unrestricted_guest, bool, S_IRUGO); 91 92 bool __read_mostly enable_ept_ad_bits = 1; 93 module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); 94 95 static bool __read_mostly emulate_invalid_guest_state = true; 96 module_param(emulate_invalid_guest_state, bool, S_IRUGO); 97 98 static bool __read_mostly fasteoi = 1; 99 module_param(fasteoi, bool, S_IRUGO); 100 101 static bool __read_mostly enable_apicv = 1; 102 module_param(enable_apicv, bool, S_IRUGO); 103 104 /* 105 * If nested=1, nested virtualization is supported, i.e., guests may use 106 * VMX and be a hypervisor for its own guests. If nested=0, guests may not 107 * use VMX instructions. 108 */ 109 static bool __read_mostly nested = 1; 110 module_param(nested, bool, S_IRUGO); 111 112 static u64 __read_mostly host_xss; 113 114 bool __read_mostly enable_pml = 1; 115 module_param_named(pml, enable_pml, bool, S_IRUGO); 116 117 #define MSR_BITMAP_MODE_X2APIC 1 118 #define MSR_BITMAP_MODE_X2APIC_APICV 2 119 120 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL 121 122 /* Guest_tsc -> host_tsc conversion requires 64-bit division. 
*/ 123 static int __read_mostly cpu_preemption_timer_multi; 124 static bool __read_mostly enable_preemption_timer = 1; 125 #ifdef CONFIG_X86_64 126 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); 127 #endif 128 129 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) 130 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE 131 #define KVM_VM_CR0_ALWAYS_ON \ 132 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ 133 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) 134 #define KVM_CR4_GUEST_OWNED_BITS \ 135 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ 136 | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD) 137 138 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE 139 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) 140 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) 141 142 #define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) 143 144 #define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \ 145 RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \ 146 RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \ 147 RTIT_STATUS_BYTECNT)) 148 149 #define MSR_IA32_RTIT_OUTPUT_BASE_MASK \ 150 (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f) 151 152 /* 153 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 154 * ple_gap: upper bound on the amount of time between two successive 155 * executions of PAUSE in a loop. Also indicate if ple enabled. 156 * According to test, this time is usually smaller than 128 cycles. 157 * ple_window: upper bound on the amount of time a guest is allowed to execute 158 * in a PAUSE loop. Tests indicate that most spinlocks are held for 159 * less than 2^12 cycles 160 * Time is measured based on a counter that runs at the same rate as the TSC, 161 * refer SDM volume 3b section 21.6.13 & 22.1.3. 
162 */ 163 static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP; 164 module_param(ple_gap, uint, 0444); 165 166 static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; 167 module_param(ple_window, uint, 0444); 168 169 /* Default doubles per-vcpu window every exit. */ 170 static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW; 171 module_param(ple_window_grow, uint, 0444); 172 173 /* Default resets per-vcpu window every exit to ple_window. */ 174 static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK; 175 module_param(ple_window_shrink, uint, 0444); 176 177 /* Default is to compute the maximum so we can never overflow. */ 178 static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; 179 module_param(ple_window_max, uint, 0444); 180 181 /* Default is SYSTEM mode, 1 for host-guest mode */ 182 int __read_mostly pt_mode = PT_MODE_SYSTEM; 183 module_param(pt_mode, int, S_IRUGO); 184 185 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); 186 static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); 187 static DEFINE_MUTEX(vmx_l1d_flush_mutex); 188 189 /* Storage for pre module init parameter parsing */ 190 static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO; 191 192 static const struct { 193 const char *option; 194 bool for_parse; 195 } vmentry_l1d_param[] = { 196 [VMENTER_L1D_FLUSH_AUTO] = {"auto", true}, 197 [VMENTER_L1D_FLUSH_NEVER] = {"never", true}, 198 [VMENTER_L1D_FLUSH_COND] = {"cond", true}, 199 [VMENTER_L1D_FLUSH_ALWAYS] = {"always", true}, 200 [VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false}, 201 [VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false}, 202 }; 203 204 #define L1D_CACHE_ORDER 4 205 static void *vmx_l1d_flush_pages; 206 207 static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) 208 { 209 struct page *page; 210 unsigned int i; 211 212 if (!enable_ept) { 213 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; 214 return 0; 215 } 216 217 if 
(boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { 218 u64 msr; 219 220 rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr); 221 if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) { 222 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; 223 return 0; 224 } 225 } 226 227 /* If set to auto use the default l1tf mitigation method */ 228 if (l1tf == VMENTER_L1D_FLUSH_AUTO) { 229 switch (l1tf_mitigation) { 230 case L1TF_MITIGATION_OFF: 231 l1tf = VMENTER_L1D_FLUSH_NEVER; 232 break; 233 case L1TF_MITIGATION_FLUSH_NOWARN: 234 case L1TF_MITIGATION_FLUSH: 235 case L1TF_MITIGATION_FLUSH_NOSMT: 236 l1tf = VMENTER_L1D_FLUSH_COND; 237 break; 238 case L1TF_MITIGATION_FULL: 239 case L1TF_MITIGATION_FULL_FORCE: 240 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 241 break; 242 } 243 } else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) { 244 l1tf = VMENTER_L1D_FLUSH_ALWAYS; 245 } 246 247 if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages && 248 !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) { 249 /* 250 * This allocation for vmx_l1d_flush_pages is not tied to a VM 251 * lifetime and so should not be charged to a memcg. 252 */ 253 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER); 254 if (!page) 255 return -ENOMEM; 256 vmx_l1d_flush_pages = page_address(page); 257 258 /* 259 * Initialize each page with a different pattern in 260 * order to protect against KSM in the nested 261 * virtualization case. 
262 */ 263 for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) { 264 memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1, 265 PAGE_SIZE); 266 } 267 } 268 269 l1tf_vmx_mitigation = l1tf; 270 271 if (l1tf != VMENTER_L1D_FLUSH_NEVER) 272 static_branch_enable(&vmx_l1d_should_flush); 273 else 274 static_branch_disable(&vmx_l1d_should_flush); 275 276 if (l1tf == VMENTER_L1D_FLUSH_COND) 277 static_branch_enable(&vmx_l1d_flush_cond); 278 else 279 static_branch_disable(&vmx_l1d_flush_cond); 280 return 0; 281 } 282 283 static int vmentry_l1d_flush_parse(const char *s) 284 { 285 unsigned int i; 286 287 if (s) { 288 for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) { 289 if (vmentry_l1d_param[i].for_parse && 290 sysfs_streq(s, vmentry_l1d_param[i].option)) 291 return i; 292 } 293 } 294 return -EINVAL; 295 } 296 297 static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp) 298 { 299 int l1tf, ret; 300 301 l1tf = vmentry_l1d_flush_parse(s); 302 if (l1tf < 0) 303 return l1tf; 304 305 if (!boot_cpu_has(X86_BUG_L1TF)) 306 return 0; 307 308 /* 309 * Has vmx_init() run already? If not then this is the pre init 310 * parameter parsing. In that case just store the value and let 311 * vmx_init() do the proper setup after enable_ept has been 312 * established. 
313 */ 314 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) { 315 vmentry_l1d_flush_param = l1tf; 316 return 0; 317 } 318 319 mutex_lock(&vmx_l1d_flush_mutex); 320 ret = vmx_setup_l1d_flush(l1tf); 321 mutex_unlock(&vmx_l1d_flush_mutex); 322 return ret; 323 } 324 325 static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp) 326 { 327 if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param))) 328 return sprintf(s, "???\n"); 329 330 return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option); 331 } 332 333 static const struct kernel_param_ops vmentry_l1d_flush_ops = { 334 .set = vmentry_l1d_flush_set, 335 .get = vmentry_l1d_flush_get, 336 }; 337 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644); 338 339 static bool guest_state_valid(struct kvm_vcpu *vcpu); 340 static u32 vmx_segment_access_rights(struct kvm_segment *var); 341 static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, 342 u32 msr, int type); 343 344 void vmx_vmexit(void); 345 346 static DEFINE_PER_CPU(struct vmcs *, vmxarea); 347 DEFINE_PER_CPU(struct vmcs *, current_vmcs); 348 /* 349 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed 350 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 351 */ 352 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); 353 354 /* 355 * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we 356 * can find which vCPU should be waken up. 
357 */ 358 static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); 359 static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); 360 361 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 362 static DEFINE_SPINLOCK(vmx_vpid_lock); 363 364 struct vmcs_config vmcs_config; 365 struct vmx_capability vmx_capability; 366 367 #define VMX_SEGMENT_FIELD(seg) \ 368 [VCPU_SREG_##seg] = { \ 369 .selector = GUEST_##seg##_SELECTOR, \ 370 .base = GUEST_##seg##_BASE, \ 371 .limit = GUEST_##seg##_LIMIT, \ 372 .ar_bytes = GUEST_##seg##_AR_BYTES, \ 373 } 374 375 static const struct kvm_vmx_segment_field { 376 unsigned selector; 377 unsigned base; 378 unsigned limit; 379 unsigned ar_bytes; 380 } kvm_vmx_segment_fields[] = { 381 VMX_SEGMENT_FIELD(CS), 382 VMX_SEGMENT_FIELD(DS), 383 VMX_SEGMENT_FIELD(ES), 384 VMX_SEGMENT_FIELD(FS), 385 VMX_SEGMENT_FIELD(GS), 386 VMX_SEGMENT_FIELD(SS), 387 VMX_SEGMENT_FIELD(TR), 388 VMX_SEGMENT_FIELD(LDTR), 389 }; 390 391 u64 host_efer; 392 393 /* 394 * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm 395 * will emulate SYSCALL in legacy mode if the vendor string in guest 396 * CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To 397 * support this emulation, IA32_STAR must always be included in 398 * vmx_msr_index[], even in i386 builds. 399 */ 400 const u32 vmx_msr_index[] = { 401 #ifdef CONFIG_X86_64 402 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, 403 #endif 404 MSR_EFER, MSR_TSC_AUX, MSR_STAR, 405 }; 406 407 #if IS_ENABLED(CONFIG_HYPERV) 408 static bool __read_mostly enlightened_vmcs = true; 409 module_param(enlightened_vmcs, bool, 0444); 410 411 /* check_ept_pointer() should be under protection of ept_pointer_lock. 
*/ 412 static void check_ept_pointer_match(struct kvm *kvm) 413 { 414 struct kvm_vcpu *vcpu; 415 u64 tmp_eptp = INVALID_PAGE; 416 int i; 417 418 kvm_for_each_vcpu(i, vcpu, kvm) { 419 if (!VALID_PAGE(tmp_eptp)) { 420 tmp_eptp = to_vmx(vcpu)->ept_pointer; 421 } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { 422 to_kvm_vmx(kvm)->ept_pointers_match 423 = EPT_POINTERS_MISMATCH; 424 return; 425 } 426 } 427 428 to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; 429 } 430 431 static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush, 432 void *data) 433 { 434 struct kvm_tlb_range *range = data; 435 436 return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn, 437 range->pages); 438 } 439 440 static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm, 441 struct kvm_vcpu *vcpu, struct kvm_tlb_range *range) 442 { 443 u64 ept_pointer = to_vmx(vcpu)->ept_pointer; 444 445 /* 446 * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs address 447 * of the base of EPT PML4 table, strip off EPT configuration 448 * information. 449 */ 450 if (range) 451 return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK, 452 kvm_fill_hv_flush_list_func, (void *)range); 453 else 454 return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK); 455 } 456 457 static int hv_remote_flush_tlb_with_range(struct kvm *kvm, 458 struct kvm_tlb_range *range) 459 { 460 struct kvm_vcpu *vcpu; 461 int ret = 0, i; 462 463 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); 464 465 if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) 466 check_ept_pointer_match(kvm); 467 468 if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { 469 kvm_for_each_vcpu(i, vcpu, kvm) { 470 /* If ept_pointer is invalid pointer, bypass flush request. 
*/ 471 if (VALID_PAGE(to_vmx(vcpu)->ept_pointer)) 472 ret |= __hv_remote_flush_tlb_with_range( 473 kvm, vcpu, range); 474 } 475 } else { 476 ret = __hv_remote_flush_tlb_with_range(kvm, 477 kvm_get_vcpu(kvm, 0), range); 478 } 479 480 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); 481 return ret; 482 } 483 static int hv_remote_flush_tlb(struct kvm *kvm) 484 { 485 return hv_remote_flush_tlb_with_range(kvm, NULL); 486 } 487 488 #endif /* IS_ENABLED(CONFIG_HYPERV) */ 489 490 /* 491 * Comment's format: document - errata name - stepping - processor name. 492 * Refer from 493 * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp 494 */ 495 static u32 vmx_preemption_cpu_tfms[] = { 496 /* 323344.pdf - BA86 - D0 - Xeon 7500 Series */ 497 0x000206E6, 498 /* 323056.pdf - AAX65 - C2 - Xeon L3406 */ 499 /* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */ 500 /* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */ 501 0x00020652, 502 /* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */ 503 0x00020655, 504 /* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */ 505 /* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */ 506 /* 507 * 320767.pdf - AAP86 - B1 - 508 * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile 509 */ 510 0x000106E5, 511 /* 321333.pdf - AAM126 - C0 - Xeon 3500 */ 512 0x000106A0, 513 /* 321333.pdf - AAM126 - C1 - Xeon 3500 */ 514 0x000106A1, 515 /* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */ 516 0x000106A4, 517 /* 321333.pdf - AAM126 - D0 - Xeon 3500 */ 518 /* 321324.pdf - AAK139 - D0 - Xeon 5500 */ 519 /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */ 520 0x000106A5, 521 /* Xeon E3-1220 V2 */ 522 0x000306A8, 523 }; 524 525 static inline bool cpu_has_broken_vmx_preemption_timer(void) 526 { 527 u32 eax = cpuid_eax(0x00000001), i; 528 529 /* Clear the reserved bits */ 530 eax &= ~(0x3U << 14 | 0xfU << 28); 531 for (i = 0; i < 
ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) 532 if (eax == vmx_preemption_cpu_tfms[i]) 533 return true; 534 535 return false; 536 } 537 538 static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) 539 { 540 return flexpriority_enabled && lapic_in_kernel(vcpu); 541 } 542 543 static inline bool report_flexpriority(void) 544 { 545 return flexpriority_enabled; 546 } 547 548 static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 549 { 550 int i; 551 552 for (i = 0; i < vmx->nmsrs; ++i) 553 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) 554 return i; 555 return -1; 556 } 557 558 struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) 559 { 560 int i; 561 562 i = __find_msr_index(vmx, msr); 563 if (i >= 0) 564 return &vmx->guest_msrs[i]; 565 return NULL; 566 } 567 568 void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) 569 { 570 vmcs_clear(loaded_vmcs->vmcs); 571 if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) 572 vmcs_clear(loaded_vmcs->shadow_vmcs); 573 loaded_vmcs->cpu = -1; 574 loaded_vmcs->launched = 0; 575 } 576 577 #ifdef CONFIG_KEXEC_CORE 578 /* 579 * This bitmap is used to indicate whether the vmclear 580 * operation is enabled on all cpus. All disabled by 581 * default. 
582 */ 583 static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; 584 585 static inline void crash_enable_local_vmclear(int cpu) 586 { 587 cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); 588 } 589 590 static inline void crash_disable_local_vmclear(int cpu) 591 { 592 cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); 593 } 594 595 static inline int crash_local_vmclear_enabled(int cpu) 596 { 597 return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); 598 } 599 600 static void crash_vmclear_local_loaded_vmcss(void) 601 { 602 int cpu = raw_smp_processor_id(); 603 struct loaded_vmcs *v; 604 605 if (!crash_local_vmclear_enabled(cpu)) 606 return; 607 608 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 609 loaded_vmcss_on_cpu_link) 610 vmcs_clear(v->vmcs); 611 } 612 #else 613 static inline void crash_enable_local_vmclear(int cpu) { } 614 static inline void crash_disable_local_vmclear(int cpu) { } 615 #endif /* CONFIG_KEXEC_CORE */ 616 617 static void __loaded_vmcs_clear(void *arg) 618 { 619 struct loaded_vmcs *loaded_vmcs = arg; 620 int cpu = raw_smp_processor_id(); 621 622 if (loaded_vmcs->cpu != cpu) 623 return; /* vcpu migration can race with cpu offline */ 624 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 625 per_cpu(current_vmcs, cpu) = NULL; 626 crash_disable_local_vmclear(cpu); 627 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 628 629 /* 630 * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link 631 * is before setting loaded_vmcs->vcpu to -1 which is done in 632 * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist 633 * then adds the vmcs into percpu list before it is deleted. 
634 */ 635 smp_wmb(); 636 637 loaded_vmcs_init(loaded_vmcs); 638 crash_enable_local_vmclear(cpu); 639 } 640 641 void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 642 { 643 int cpu = loaded_vmcs->cpu; 644 645 if (cpu != -1) 646 smp_call_function_single(cpu, 647 __loaded_vmcs_clear, loaded_vmcs, 1); 648 } 649 650 static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, 651 unsigned field) 652 { 653 bool ret; 654 u32 mask = 1 << (seg * SEG_FIELD_NR + field); 655 656 if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { 657 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); 658 vmx->segment_cache.bitmask = 0; 659 } 660 ret = vmx->segment_cache.bitmask & mask; 661 vmx->segment_cache.bitmask |= mask; 662 return ret; 663 } 664 665 static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) 666 { 667 u16 *p = &vmx->segment_cache.seg[seg].selector; 668 669 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) 670 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); 671 return *p; 672 } 673 674 static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) 675 { 676 ulong *p = &vmx->segment_cache.seg[seg].base; 677 678 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) 679 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); 680 return *p; 681 } 682 683 static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) 684 { 685 u32 *p = &vmx->segment_cache.seg[seg].limit; 686 687 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) 688 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); 689 return *p; 690 } 691 692 static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) 693 { 694 u32 *p = &vmx->segment_cache.seg[seg].ar; 695 696 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) 697 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); 698 return *p; 699 } 700 701 void update_exception_bitmap(struct kvm_vcpu *vcpu) 702 { 703 u32 eb; 704 705 eb = (1u << PF_VECTOR) 
| (1u << UD_VECTOR) | (1u << MC_VECTOR) | 706 (1u << DB_VECTOR) | (1u << AC_VECTOR); 707 /* 708 * Guest access to VMware backdoor ports could legitimately 709 * trigger #GP because of TSS I/O permission bitmap. 710 * We intercept those #GP and allow access to them anyway 711 * as VMware does. 712 */ 713 if (enable_vmware_backdoor) 714 eb |= (1u << GP_VECTOR); 715 if ((vcpu->guest_debug & 716 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 717 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 718 eb |= 1u << BP_VECTOR; 719 if (to_vmx(vcpu)->rmode.vm86_active) 720 eb = ~0; 721 if (enable_ept) 722 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 723 724 /* When we are running a nested L2 guest and L1 specified for it a 725 * certain exception bitmap, we must trap the same exceptions and pass 726 * them to L1. When running L2, we will only handle the exceptions 727 * specified above if L1 did not want them. 728 */ 729 if (is_guest_mode(vcpu)) 730 eb |= get_vmcs12(vcpu)->exception_bitmap; 731 732 vmcs_write32(EXCEPTION_BITMAP, eb); 733 } 734 735 /* 736 * Check if MSR is intercepted for currently loaded MSR bitmap. 
737 */ 738 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) 739 { 740 unsigned long *msr_bitmap; 741 int f = sizeof(unsigned long); 742 743 if (!cpu_has_vmx_msr_bitmap()) 744 return true; 745 746 msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; 747 748 if (msr <= 0x1fff) { 749 return !!test_bit(msr, msr_bitmap + 0x800 / f); 750 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 751 msr &= 0x1fff; 752 return !!test_bit(msr, msr_bitmap + 0xc00 / f); 753 } 754 755 return true; 756 } 757 758 static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, 759 unsigned long entry, unsigned long exit) 760 { 761 vm_entry_controls_clearbit(vmx, entry); 762 vm_exit_controls_clearbit(vmx, exit); 763 } 764 765 static int find_msr(struct vmx_msrs *m, unsigned int msr) 766 { 767 unsigned int i; 768 769 for (i = 0; i < m->nr; ++i) { 770 if (m->val[i].index == msr) 771 return i; 772 } 773 return -ENOENT; 774 } 775 776 static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 777 { 778 int i; 779 struct msr_autoload *m = &vmx->msr_autoload; 780 781 switch (msr) { 782 case MSR_EFER: 783 if (cpu_has_load_ia32_efer()) { 784 clear_atomic_switch_msr_special(vmx, 785 VM_ENTRY_LOAD_IA32_EFER, 786 VM_EXIT_LOAD_IA32_EFER); 787 return; 788 } 789 break; 790 case MSR_CORE_PERF_GLOBAL_CTRL: 791 if (cpu_has_load_perf_global_ctrl()) { 792 clear_atomic_switch_msr_special(vmx, 793 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 794 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 795 return; 796 } 797 break; 798 } 799 i = find_msr(&m->guest, msr); 800 if (i < 0) 801 goto skip_guest; 802 --m->guest.nr; 803 m->guest.val[i] = m->guest.val[m->guest.nr]; 804 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 805 806 skip_guest: 807 i = find_msr(&m->host, msr); 808 if (i < 0) 809 return; 810 811 --m->host.nr; 812 m->host.val[i] = m->host.val[m->host.nr]; 813 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 814 } 815 816 static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, 
817 unsigned long entry, unsigned long exit, 818 unsigned long guest_val_vmcs, unsigned long host_val_vmcs, 819 u64 guest_val, u64 host_val) 820 { 821 vmcs_write64(guest_val_vmcs, guest_val); 822 if (host_val_vmcs != HOST_IA32_EFER) 823 vmcs_write64(host_val_vmcs, host_val); 824 vm_entry_controls_setbit(vmx, entry); 825 vm_exit_controls_setbit(vmx, exit); 826 } 827 828 static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 829 u64 guest_val, u64 host_val, bool entry_only) 830 { 831 int i, j = 0; 832 struct msr_autoload *m = &vmx->msr_autoload; 833 834 switch (msr) { 835 case MSR_EFER: 836 if (cpu_has_load_ia32_efer()) { 837 add_atomic_switch_msr_special(vmx, 838 VM_ENTRY_LOAD_IA32_EFER, 839 VM_EXIT_LOAD_IA32_EFER, 840 GUEST_IA32_EFER, 841 HOST_IA32_EFER, 842 guest_val, host_val); 843 return; 844 } 845 break; 846 case MSR_CORE_PERF_GLOBAL_CTRL: 847 if (cpu_has_load_perf_global_ctrl()) { 848 add_atomic_switch_msr_special(vmx, 849 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 850 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, 851 GUEST_IA32_PERF_GLOBAL_CTRL, 852 HOST_IA32_PERF_GLOBAL_CTRL, 853 guest_val, host_val); 854 return; 855 } 856 break; 857 case MSR_IA32_PEBS_ENABLE: 858 /* PEBS needs a quiescent period after being disabled (to write 859 * a record). Disabling PEBS through VMX MSR swapping doesn't 860 * provide that period, so a CPU could write host's record into 861 * guest's memory. 862 */ 863 wrmsrl(MSR_IA32_PEBS_ENABLE, 0); 864 } 865 866 i = find_msr(&m->guest, msr); 867 if (!entry_only) 868 j = find_msr(&m->host, msr); 869 870 if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) || 871 (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) { 872 printk_once(KERN_WARNING "Not enough msr switch entries. 
" 873 "Can't add msr %x\n", msr); 874 return; 875 } 876 if (i < 0) { 877 i = m->guest.nr++; 878 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 879 } 880 m->guest.val[i].index = msr; 881 m->guest.val[i].value = guest_val; 882 883 if (entry_only) 884 return; 885 886 if (j < 0) { 887 j = m->host.nr++; 888 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 889 } 890 m->host.val[j].index = msr; 891 m->host.val[j].value = host_val; 892 } 893 894 static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) 895 { 896 u64 guest_efer = vmx->vcpu.arch.efer; 897 u64 ignore_bits = 0; 898 899 if (!enable_ept) { 900 /* 901 * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing 902 * host CPUID is more efficient than testing guest CPUID 903 * or CR4. Host SMEP is anyway a requirement for guest SMEP. 904 */ 905 if (boot_cpu_has(X86_FEATURE_SMEP)) 906 guest_efer |= EFER_NX; 907 else if (!(guest_efer & EFER_NX)) 908 ignore_bits |= EFER_NX; 909 } 910 911 /* 912 * LMA and LME handled by hardware; SCE meaningless outside long mode. 913 */ 914 ignore_bits |= EFER_SCE; 915 #ifdef CONFIG_X86_64 916 ignore_bits |= EFER_LMA | EFER_LME; 917 /* SCE is meaningful only in long mode on Intel */ 918 if (guest_efer & EFER_LMA) 919 ignore_bits &= ~(u64)EFER_SCE; 920 #endif 921 922 /* 923 * On EPT, we can't emulate NX, so we must switch EFER atomically. 924 * On CPUs that support "load IA32_EFER", always switch EFER 925 * atomically, since it's faster than switching it manually. 
926 */ 927 if (cpu_has_load_ia32_efer() || 928 (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) { 929 if (!(guest_efer & EFER_LMA)) 930 guest_efer &= ~EFER_LME; 931 if (guest_efer != host_efer) 932 add_atomic_switch_msr(vmx, MSR_EFER, 933 guest_efer, host_efer, false); 934 else 935 clear_atomic_switch_msr(vmx, MSR_EFER); 936 return false; 937 } else { 938 clear_atomic_switch_msr(vmx, MSR_EFER); 939 940 guest_efer &= ~ignore_bits; 941 guest_efer |= host_efer & ignore_bits; 942 943 vmx->guest_msrs[efer_offset].data = guest_efer; 944 vmx->guest_msrs[efer_offset].mask = ~ignore_bits; 945 946 return true; 947 } 948 } 949 950 #ifdef CONFIG_X86_32 951 /* 952 * On 32-bit kernels, VM exits still load the FS and GS bases from the 953 * VMCS rather than the segment table. KVM uses this helper to figure 954 * out the current bases to poke them into the VMCS before entry. 955 */ 956 static unsigned long segment_base(u16 selector) 957 { 958 struct desc_struct *table; 959 unsigned long v; 960 961 if (!(selector & ~SEGMENT_RPL_MASK)) 962 return 0; 963 964 table = get_current_gdt_ro(); 965 966 if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { 967 u16 ldt_selector = kvm_read_ldt(); 968 969 if (!(ldt_selector & ~SEGMENT_RPL_MASK)) 970 return 0; 971 972 table = (struct desc_struct *)segment_base(ldt_selector); 973 } 974 v = get_desc_base(&table[selector >> 3]); 975 return v; 976 } 977 #endif 978 979 static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range) 980 { 981 u32 i; 982 983 wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status); 984 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 985 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 986 wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 987 for (i = 0; i < addr_range; i++) { 988 wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 989 wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 990 } 991 } 992 993 static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) 994 { 995 u32 i; 996 997 
rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status); 998 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base); 999 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask); 1000 rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match); 1001 for (i = 0; i < addr_range; i++) { 1002 rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]); 1003 rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]); 1004 } 1005 } 1006 1007 static void pt_guest_enter(struct vcpu_vmx *vmx) 1008 { 1009 if (pt_mode == PT_MODE_SYSTEM) 1010 return; 1011 1012 /* 1013 * GUEST_IA32_RTIT_CTL is already set in the VMCS. 1014 * Save host state before VM entry. 1015 */ 1016 rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1017 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1018 wrmsrl(MSR_IA32_RTIT_CTL, 0); 1019 pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); 1020 pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); 1021 } 1022 } 1023 1024 static void pt_guest_exit(struct vcpu_vmx *vmx) 1025 { 1026 if (pt_mode == PT_MODE_SYSTEM) 1027 return; 1028 1029 if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { 1030 pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range); 1031 pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range); 1032 } 1033 1034 /* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */ 1035 wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl); 1036 } 1037 1038 void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) 1039 { 1040 struct vcpu_vmx *vmx = to_vmx(vcpu); 1041 struct vmcs_host_state *host_state; 1042 #ifdef CONFIG_X86_64 1043 int cpu = raw_smp_processor_id(); 1044 #endif 1045 unsigned long fs_base, gs_base; 1046 u16 fs_sel, gs_sel; 1047 int i; 1048 1049 vmx->req_immediate_exit = false; 1050 1051 /* 1052 * Note that guest MSRs to be saved/restored can also be changed 1053 * when guest state is loaded. This happens when guest transitions 1054 * to/from long-mode by setting MSR_EFER.LMA. 
1055 */ 1056 if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) { 1057 vmx->guest_msrs_dirty = false; 1058 for (i = 0; i < vmx->save_nmsrs; ++i) 1059 kvm_set_shared_msr(vmx->guest_msrs[i].index, 1060 vmx->guest_msrs[i].data, 1061 vmx->guest_msrs[i].mask); 1062 1063 } 1064 1065 if (vmx->loaded_cpu_state) 1066 return; 1067 1068 vmx->loaded_cpu_state = vmx->loaded_vmcs; 1069 host_state = &vmx->loaded_cpu_state->host_state; 1070 1071 /* 1072 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 1073 * allow segment selectors with cpl > 0 or ti == 1. 1074 */ 1075 host_state->ldt_sel = kvm_read_ldt(); 1076 1077 #ifdef CONFIG_X86_64 1078 savesegment(ds, host_state->ds_sel); 1079 savesegment(es, host_state->es_sel); 1080 1081 gs_base = cpu_kernelmode_gs_base(cpu); 1082 if (likely(is_64bit_mm(current->mm))) { 1083 save_fsgs_for_kvm(); 1084 fs_sel = current->thread.fsindex; 1085 gs_sel = current->thread.gsindex; 1086 fs_base = current->thread.fsbase; 1087 vmx->msr_host_kernel_gs_base = current->thread.gsbase; 1088 } else { 1089 savesegment(fs, fs_sel); 1090 savesegment(gs, gs_sel); 1091 fs_base = read_msr(MSR_FS_BASE); 1092 vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); 1093 } 1094 1095 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1096 #else 1097 savesegment(fs, fs_sel); 1098 savesegment(gs, gs_sel); 1099 fs_base = segment_base(fs_sel); 1100 gs_base = segment_base(gs_sel); 1101 #endif 1102 1103 if (unlikely(fs_sel != host_state->fs_sel)) { 1104 if (!(fs_sel & 7)) 1105 vmcs_write16(HOST_FS_SELECTOR, fs_sel); 1106 else 1107 vmcs_write16(HOST_FS_SELECTOR, 0); 1108 host_state->fs_sel = fs_sel; 1109 } 1110 if (unlikely(gs_sel != host_state->gs_sel)) { 1111 if (!(gs_sel & 7)) 1112 vmcs_write16(HOST_GS_SELECTOR, gs_sel); 1113 else 1114 vmcs_write16(HOST_GS_SELECTOR, 0); 1115 host_state->gs_sel = gs_sel; 1116 } 1117 if (unlikely(fs_base != host_state->fs_base)) { 1118 vmcs_writel(HOST_FS_BASE, fs_base); 1119 host_state->fs_base = fs_base; 1120 
} 1121 if (unlikely(gs_base != host_state->gs_base)) { 1122 vmcs_writel(HOST_GS_BASE, gs_base); 1123 host_state->gs_base = gs_base; 1124 } 1125 } 1126 1127 static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) 1128 { 1129 struct vmcs_host_state *host_state; 1130 1131 if (!vmx->loaded_cpu_state) 1132 return; 1133 1134 WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); 1135 host_state = &vmx->loaded_cpu_state->host_state; 1136 1137 ++vmx->vcpu.stat.host_state_reload; 1138 vmx->loaded_cpu_state = NULL; 1139 1140 #ifdef CONFIG_X86_64 1141 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1142 #endif 1143 if (host_state->ldt_sel || (host_state->gs_sel & 7)) { 1144 kvm_load_ldt(host_state->ldt_sel); 1145 #ifdef CONFIG_X86_64 1146 load_gs_index(host_state->gs_sel); 1147 #else 1148 loadsegment(gs, host_state->gs_sel); 1149 #endif 1150 } 1151 if (host_state->fs_sel & 7) 1152 loadsegment(fs, host_state->fs_sel); 1153 #ifdef CONFIG_X86_64 1154 if (unlikely(host_state->ds_sel | host_state->es_sel)) { 1155 loadsegment(ds, host_state->ds_sel); 1156 loadsegment(es, host_state->es_sel); 1157 } 1158 #endif 1159 invalidate_tss_limit(); 1160 #ifdef CONFIG_X86_64 1161 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 1162 #endif 1163 load_fixmap_gdt(raw_smp_processor_id()); 1164 } 1165 1166 #ifdef CONFIG_X86_64 1167 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) 1168 { 1169 preempt_disable(); 1170 if (vmx->loaded_cpu_state) 1171 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 1172 preempt_enable(); 1173 return vmx->msr_guest_kernel_gs_base; 1174 } 1175 1176 static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) 1177 { 1178 preempt_disable(); 1179 if (vmx->loaded_cpu_state) 1180 wrmsrl(MSR_KERNEL_GS_BASE, data); 1181 preempt_enable(); 1182 vmx->msr_guest_kernel_gs_base = data; 1183 } 1184 #endif 1185 1186 static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) 1187 { 1188 struct pi_desc *pi_desc = 
vcpu_to_pi_desc(vcpu); 1189 struct pi_desc old, new; 1190 unsigned int dest; 1191 1192 /* 1193 * In case of hot-plug or hot-unplug, we may have to undo 1194 * vmx_vcpu_pi_put even if there is no assigned device. And we 1195 * always keep PI.NDST up to date for simplicity: it makes the 1196 * code easier, and CPU migration is not a fast path. 1197 */ 1198 if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) 1199 return; 1200 1201 /* The full case. */ 1202 do { 1203 old.control = new.control = pi_desc->control; 1204 1205 dest = cpu_physical_id(cpu); 1206 1207 if (x2apic_enabled()) 1208 new.ndst = dest; 1209 else 1210 new.ndst = (dest << 8) & 0xFF00; 1211 1212 new.sn = 0; 1213 } while (cmpxchg64(&pi_desc->control, old.control, 1214 new.control) != old.control); 1215 1216 /* 1217 * Clear SN before reading the bitmap. The VT-d firmware 1218 * writes the bitmap and reads SN atomically (5.2.3 in the 1219 * spec), so it doesn't really have a memory barrier that 1220 * pairs with this, but we cannot do that and we need one. 1221 */ 1222 smp_mb__after_atomic(); 1223 1224 if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) 1225 pi_set_on(pi_desc); 1226 } 1227 1228 /* 1229 * Switches to specified vcpu, until a matching vcpu_put(), but assumes 1230 * vcpu mutex is already taken. 1231 */ 1232 void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1233 { 1234 struct vcpu_vmx *vmx = to_vmx(vcpu); 1235 bool already_loaded = vmx->loaded_vmcs->cpu == cpu; 1236 1237 if (!already_loaded) { 1238 loaded_vmcs_clear(vmx->loaded_vmcs); 1239 local_irq_disable(); 1240 crash_disable_local_vmclear(cpu); 1241 1242 /* 1243 * Read loaded_vmcs->cpu should be before fetching 1244 * loaded_vmcs->loaded_vmcss_on_cpu_link. 1245 * See the comments in __loaded_vmcs_clear(). 
1246 */ 1247 smp_rmb(); 1248 1249 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1250 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1251 crash_enable_local_vmclear(cpu); 1252 local_irq_enable(); 1253 } 1254 1255 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { 1256 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; 1257 vmcs_load(vmx->loaded_vmcs->vmcs); 1258 indirect_branch_prediction_barrier(); 1259 } 1260 1261 if (!already_loaded) { 1262 void *gdt = get_current_gdt_ro(); 1263 unsigned long sysenter_esp; 1264 1265 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1266 1267 /* 1268 * Linux uses per-cpu TSS and GDT, so set these when switching 1269 * processors. See 22.2.4. 1270 */ 1271 vmcs_writel(HOST_TR_BASE, 1272 (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss); 1273 vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ 1274 1275 /* 1276 * VM exits change the host TR limit to 0x67 after a VM 1277 * exit. This is okay, since 0x67 covers everything except 1278 * the IO bitmap and have have code to handle the IO bitmap 1279 * being lost after a VM exit. 
1280 */ 1281 BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67); 1282 1283 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 1284 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 1285 1286 vmx->loaded_vmcs->cpu = cpu; 1287 } 1288 1289 /* Setup TSC multiplier */ 1290 if (kvm_has_tsc_control && 1291 vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) 1292 decache_tsc_multiplier(vmx); 1293 1294 vmx_vcpu_pi_load(vcpu, cpu); 1295 vmx->host_pkru = read_pkru(); 1296 vmx->host_debugctlmsr = get_debugctlmsr(); 1297 } 1298 1299 static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) 1300 { 1301 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 1302 1303 if (!kvm_arch_has_assigned_device(vcpu->kvm) || 1304 !irq_remapping_cap(IRQ_POSTING_CAP) || 1305 !kvm_vcpu_apicv_active(vcpu)) 1306 return; 1307 1308 /* Set SN when the vCPU is preempted */ 1309 if (vcpu->preempted) 1310 pi_set_sn(pi_desc); 1311 } 1312 1313 void vmx_vcpu_put(struct kvm_vcpu *vcpu) 1314 { 1315 vmx_vcpu_pi_put(vcpu); 1316 1317 vmx_prepare_switch_to_host(to_vmx(vcpu)); 1318 } 1319 1320 static bool emulation_required(struct kvm_vcpu *vcpu) 1321 { 1322 return emulate_invalid_guest_state && !guest_state_valid(vcpu); 1323 } 1324 1325 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); 1326 1327 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 1328 { 1329 unsigned long rflags, save_rflags; 1330 1331 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { 1332 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); 1333 rflags = vmcs_readl(GUEST_RFLAGS); 1334 if (to_vmx(vcpu)->rmode.vm86_active) { 1335 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1336 save_rflags = to_vmx(vcpu)->rmode.save_rflags; 1337 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1338 } 1339 to_vmx(vcpu)->rflags = rflags; 1340 } 1341 return to_vmx(vcpu)->rflags; 1342 } 1343 1344 void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1345 { 1346 unsigned long old_rflags = vmx_get_rflags(vcpu); 1347 
1348 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); 1349 to_vmx(vcpu)->rflags = rflags; 1350 if (to_vmx(vcpu)->rmode.vm86_active) { 1351 to_vmx(vcpu)->rmode.save_rflags = rflags; 1352 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1353 } 1354 vmcs_writel(GUEST_RFLAGS, rflags); 1355 1356 if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM) 1357 to_vmx(vcpu)->emulation_required = emulation_required(vcpu); 1358 } 1359 1360 u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) 1361 { 1362 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1363 int ret = 0; 1364 1365 if (interruptibility & GUEST_INTR_STATE_STI) 1366 ret |= KVM_X86_SHADOW_INT_STI; 1367 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 1368 ret |= KVM_X86_SHADOW_INT_MOV_SS; 1369 1370 return ret; 1371 } 1372 1373 void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 1374 { 1375 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 1376 u32 interruptibility = interruptibility_old; 1377 1378 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 1379 1380 if (mask & KVM_X86_SHADOW_INT_MOV_SS) 1381 interruptibility |= GUEST_INTR_STATE_MOV_SS; 1382 else if (mask & KVM_X86_SHADOW_INT_STI) 1383 interruptibility |= GUEST_INTR_STATE_STI; 1384 1385 if ((interruptibility != interruptibility_old)) 1386 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); 1387 } 1388 1389 static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) 1390 { 1391 struct vcpu_vmx *vmx = to_vmx(vcpu); 1392 unsigned long value; 1393 1394 /* 1395 * Any MSR write that attempts to change bits marked reserved will 1396 * case a #GP fault. 1397 */ 1398 if (data & vmx->pt_desc.ctl_bitmask) 1399 return 1; 1400 1401 /* 1402 * Any attempt to modify IA32_RTIT_CTL while TraceEn is set will 1403 * result in a #GP unless the same write also clears TraceEn. 
1404 */ 1405 if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) && 1406 ((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN)) 1407 return 1; 1408 1409 /* 1410 * WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit 1411 * and FabricEn would cause #GP, if 1412 * CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0 1413 */ 1414 if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) && 1415 !(data & RTIT_CTL_FABRIC_EN) && 1416 !intel_pt_validate_cap(vmx->pt_desc.caps, 1417 PT_CAP_single_range_output)) 1418 return 1; 1419 1420 /* 1421 * MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that 1422 * utilize encodings marked reserved will casue a #GP fault. 1423 */ 1424 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods); 1425 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) && 1426 !test_bit((data & RTIT_CTL_MTC_RANGE) >> 1427 RTIT_CTL_MTC_RANGE_OFFSET, &value)) 1428 return 1; 1429 value = intel_pt_validate_cap(vmx->pt_desc.caps, 1430 PT_CAP_cycle_thresholds); 1431 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1432 !test_bit((data & RTIT_CTL_CYC_THRESH) >> 1433 RTIT_CTL_CYC_THRESH_OFFSET, &value)) 1434 return 1; 1435 value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods); 1436 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) && 1437 !test_bit((data & RTIT_CTL_PSB_FREQ) >> 1438 RTIT_CTL_PSB_FREQ_OFFSET, &value)) 1439 return 1; 1440 1441 /* 1442 * If ADDRx_CFG is reserved or the encodings is >2 will 1443 * cause a #GP fault. 
1444 */ 1445 value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET; 1446 if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2)) 1447 return 1; 1448 value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET; 1449 if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2)) 1450 return 1; 1451 value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET; 1452 if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2)) 1453 return 1; 1454 value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET; 1455 if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2)) 1456 return 1; 1457 1458 return 0; 1459 } 1460 1461 1462 static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 1463 { 1464 unsigned long rip; 1465 1466 rip = kvm_rip_read(vcpu); 1467 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 1468 kvm_rip_write(vcpu, rip); 1469 1470 /* skipping an emulated instruction also counts */ 1471 vmx_set_interrupt_shadow(vcpu, 0); 1472 } 1473 1474 static void vmx_clear_hlt(struct kvm_vcpu *vcpu) 1475 { 1476 /* 1477 * Ensure that we clear the HLT state in the VMCS. We don't need to 1478 * explicitly skip the instruction because if the HLT state is set, 1479 * then the instruction is already executing and RIP has already been 1480 * advanced. 
1481 */ 1482 if (kvm_hlt_in_guest(vcpu->kvm) && 1483 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) 1484 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 1485 } 1486 1487 static void vmx_queue_exception(struct kvm_vcpu *vcpu) 1488 { 1489 struct vcpu_vmx *vmx = to_vmx(vcpu); 1490 unsigned nr = vcpu->arch.exception.nr; 1491 bool has_error_code = vcpu->arch.exception.has_error_code; 1492 u32 error_code = vcpu->arch.exception.error_code; 1493 u32 intr_info = nr | INTR_INFO_VALID_MASK; 1494 1495 kvm_deliver_exception_payload(vcpu); 1496 1497 if (has_error_code) { 1498 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); 1499 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 1500 } 1501 1502 if (vmx->rmode.vm86_active) { 1503 int inc_eip = 0; 1504 if (kvm_exception_is_soft(nr)) 1505 inc_eip = vcpu->arch.event_exit_inst_len; 1506 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) 1507 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 1508 return; 1509 } 1510 1511 WARN_ON_ONCE(vmx->emulation_required); 1512 1513 if (kvm_exception_is_soft(nr)) { 1514 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1515 vmx->vcpu.arch.event_exit_inst_len); 1516 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 1517 } else 1518 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1519 1520 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1521 1522 vmx_clear_hlt(vcpu); 1523 } 1524 1525 static bool vmx_rdtscp_supported(void) 1526 { 1527 return cpu_has_vmx_rdtscp(); 1528 } 1529 1530 static bool vmx_invpcid_supported(void) 1531 { 1532 return cpu_has_vmx_invpcid(); 1533 } 1534 1535 /* 1536 * Swap MSR entry in host/guest MSR entry array. 1537 */ 1538 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) 1539 { 1540 struct shared_msr_entry tmp; 1541 1542 tmp = vmx->guest_msrs[to]; 1543 vmx->guest_msrs[to] = vmx->guest_msrs[from]; 1544 vmx->guest_msrs[from] = tmp; 1545 } 1546 1547 /* 1548 * Set up the vmcs to automatically save and restore system 1549 * msrs. 
Don't touch the 64-bit msrs if the guest is in legacy 1550 * mode, as fiddling with msrs is very expensive. 1551 */ 1552 static void setup_msrs(struct vcpu_vmx *vmx) 1553 { 1554 int save_nmsrs, index; 1555 1556 save_nmsrs = 0; 1557 #ifdef CONFIG_X86_64 1558 /* 1559 * The SYSCALL MSRs are only needed on long mode guests, and only 1560 * when EFER.SCE is set. 1561 */ 1562 if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) { 1563 index = __find_msr_index(vmx, MSR_STAR); 1564 if (index >= 0) 1565 move_msr_up(vmx, index, save_nmsrs++); 1566 index = __find_msr_index(vmx, MSR_LSTAR); 1567 if (index >= 0) 1568 move_msr_up(vmx, index, save_nmsrs++); 1569 index = __find_msr_index(vmx, MSR_SYSCALL_MASK); 1570 if (index >= 0) 1571 move_msr_up(vmx, index, save_nmsrs++); 1572 } 1573 #endif 1574 index = __find_msr_index(vmx, MSR_EFER); 1575 if (index >= 0 && update_transition_efer(vmx, index)) 1576 move_msr_up(vmx, index, save_nmsrs++); 1577 index = __find_msr_index(vmx, MSR_TSC_AUX); 1578 if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) 1579 move_msr_up(vmx, index, save_nmsrs++); 1580 1581 vmx->save_nmsrs = save_nmsrs; 1582 vmx->guest_msrs_dirty = true; 1583 1584 if (cpu_has_vmx_msr_bitmap()) 1585 vmx_update_msr_bitmap(&vmx->vcpu); 1586 } 1587 1588 static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) 1589 { 1590 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1591 1592 if (is_guest_mode(vcpu) && 1593 (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) 1594 return vcpu->arch.tsc_offset - vmcs12->tsc_offset; 1595 1596 return vcpu->arch.tsc_offset; 1597 } 1598 1599 static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1600 { 1601 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1602 u64 g_tsc_offset = 0; 1603 1604 /* 1605 * We're here if L1 chose not to trap WRMSR to TSC. 
According 1606 * to the spec, this should set L1's TSC; The offset that L1 1607 * set for L2 remains unchanged, and still needs to be added 1608 * to the newly set TSC to get L2's TSC. 1609 */ 1610 if (is_guest_mode(vcpu) && 1611 (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)) 1612 g_tsc_offset = vmcs12->tsc_offset; 1613 1614 trace_kvm_write_tsc_offset(vcpu->vcpu_id, 1615 vcpu->arch.tsc_offset - g_tsc_offset, 1616 offset); 1617 vmcs_write64(TSC_OFFSET, offset + g_tsc_offset); 1618 return offset + g_tsc_offset; 1619 } 1620 1621 /* 1622 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX 1623 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for 1624 * all guests if the "nested" module option is off, and can also be disabled 1625 * for a single guest by disabling its VMX cpuid bit. 1626 */ 1627 bool nested_vmx_allowed(struct kvm_vcpu *vcpu) 1628 { 1629 return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX); 1630 } 1631 1632 static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, 1633 uint64_t val) 1634 { 1635 uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; 1636 1637 return !(val & ~valid_bits); 1638 } 1639 1640 static int vmx_get_msr_feature(struct kvm_msr_entry *msr) 1641 { 1642 switch (msr->index) { 1643 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 1644 if (!nested) 1645 return 1; 1646 return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); 1647 default: 1648 return 1; 1649 } 1650 1651 return 0; 1652 } 1653 1654 /* 1655 * Reads an msr value (of 'msr_index') into 'pdata'. 1656 * Returns 0 on success, non-0 otherwise. 1657 * Assumes vcpu_load() was already called. 
1658 */ 1659 static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 1660 { 1661 struct vcpu_vmx *vmx = to_vmx(vcpu); 1662 struct shared_msr_entry *msr; 1663 u32 index; 1664 1665 switch (msr_info->index) { 1666 #ifdef CONFIG_X86_64 1667 case MSR_FS_BASE: 1668 msr_info->data = vmcs_readl(GUEST_FS_BASE); 1669 break; 1670 case MSR_GS_BASE: 1671 msr_info->data = vmcs_readl(GUEST_GS_BASE); 1672 break; 1673 case MSR_KERNEL_GS_BASE: 1674 msr_info->data = vmx_read_guest_kernel_gs_base(vmx); 1675 break; 1676 #endif 1677 case MSR_EFER: 1678 return kvm_get_msr_common(vcpu, msr_info); 1679 case MSR_IA32_SPEC_CTRL: 1680 if (!msr_info->host_initiated && 1681 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) 1682 return 1; 1683 1684 msr_info->data = to_vmx(vcpu)->spec_ctrl; 1685 break; 1686 case MSR_IA32_SYSENTER_CS: 1687 msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); 1688 break; 1689 case MSR_IA32_SYSENTER_EIP: 1690 msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); 1691 break; 1692 case MSR_IA32_SYSENTER_ESP: 1693 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); 1694 break; 1695 case MSR_IA32_BNDCFGS: 1696 if (!kvm_mpx_supported() || 1697 (!msr_info->host_initiated && 1698 !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) 1699 return 1; 1700 msr_info->data = vmcs_read64(GUEST_BNDCFGS); 1701 break; 1702 case MSR_IA32_MCG_EXT_CTL: 1703 if (!msr_info->host_initiated && 1704 !(vmx->msr_ia32_feature_control & 1705 FEATURE_CONTROL_LMCE)) 1706 return 1; 1707 msr_info->data = vcpu->arch.mcg_ext_ctl; 1708 break; 1709 case MSR_IA32_FEATURE_CONTROL: 1710 msr_info->data = vmx->msr_ia32_feature_control; 1711 break; 1712 case MSR_IA32_VMX_BASIC ... 
MSR_IA32_VMX_VMFUNC: 1713 if (!nested_vmx_allowed(vcpu)) 1714 return 1; 1715 return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, 1716 &msr_info->data); 1717 case MSR_IA32_XSS: 1718 if (!vmx_xsaves_supported()) 1719 return 1; 1720 msr_info->data = vcpu->arch.ia32_xss; 1721 break; 1722 case MSR_IA32_RTIT_CTL: 1723 if (pt_mode != PT_MODE_HOST_GUEST) 1724 return 1; 1725 msr_info->data = vmx->pt_desc.guest.ctl; 1726 break; 1727 case MSR_IA32_RTIT_STATUS: 1728 if (pt_mode != PT_MODE_HOST_GUEST) 1729 return 1; 1730 msr_info->data = vmx->pt_desc.guest.status; 1731 break; 1732 case MSR_IA32_RTIT_CR3_MATCH: 1733 if ((pt_mode != PT_MODE_HOST_GUEST) || 1734 !intel_pt_validate_cap(vmx->pt_desc.caps, 1735 PT_CAP_cr3_filtering)) 1736 return 1; 1737 msr_info->data = vmx->pt_desc.guest.cr3_match; 1738 break; 1739 case MSR_IA32_RTIT_OUTPUT_BASE: 1740 if ((pt_mode != PT_MODE_HOST_GUEST) || 1741 (!intel_pt_validate_cap(vmx->pt_desc.caps, 1742 PT_CAP_topa_output) && 1743 !intel_pt_validate_cap(vmx->pt_desc.caps, 1744 PT_CAP_single_range_output))) 1745 return 1; 1746 msr_info->data = vmx->pt_desc.guest.output_base; 1747 break; 1748 case MSR_IA32_RTIT_OUTPUT_MASK: 1749 if ((pt_mode != PT_MODE_HOST_GUEST) || 1750 (!intel_pt_validate_cap(vmx->pt_desc.caps, 1751 PT_CAP_topa_output) && 1752 !intel_pt_validate_cap(vmx->pt_desc.caps, 1753 PT_CAP_single_range_output))) 1754 return 1; 1755 msr_info->data = vmx->pt_desc.guest.output_mask; 1756 break; 1757 case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: 1758 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 1759 if ((pt_mode != PT_MODE_HOST_GUEST) || 1760 (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, 1761 PT_CAP_num_address_ranges))) 1762 return 1; 1763 if (index % 2) 1764 msr_info->data = vmx->pt_desc.guest.addr_b[index / 2]; 1765 else 1766 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; 1767 break; 1768 case MSR_TSC_AUX: 1769 if (!msr_info->host_initiated && 1770 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) 1771 return 1; 1772 /* Else, falls through */ 1773 default: 1774 msr = find_msr_entry(vmx, msr_info->index); 1775 if (msr) { 1776 msr_info->data = msr->data; 1777 break; 1778 } 1779 return kvm_get_msr_common(vcpu, msr_info); 1780 } 1781 1782 return 0; 1783 } 1784 1785 /* 1786 * Writes msr value into into the appropriate "register". 1787 * Returns 0 on success, non-0 otherwise. 1788 * Assumes vcpu_load() was already called. 1789 */ 1790 static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 1791 { 1792 struct vcpu_vmx *vmx = to_vmx(vcpu); 1793 struct shared_msr_entry *msr; 1794 int ret = 0; 1795 u32 msr_index = msr_info->index; 1796 u64 data = msr_info->data; 1797 u32 index; 1798 1799 switch (msr_index) { 1800 case MSR_EFER: 1801 ret = kvm_set_msr_common(vcpu, msr_info); 1802 break; 1803 #ifdef CONFIG_X86_64 1804 case MSR_FS_BASE: 1805 vmx_segment_cache_clear(vmx); 1806 vmcs_writel(GUEST_FS_BASE, data); 1807 break; 1808 case MSR_GS_BASE: 1809 vmx_segment_cache_clear(vmx); 1810 vmcs_writel(GUEST_GS_BASE, data); 1811 break; 1812 case MSR_KERNEL_GS_BASE: 1813 vmx_write_guest_kernel_gs_base(vmx, data); 1814 break; 1815 #endif 1816 case MSR_IA32_SYSENTER_CS: 1817 vmcs_write32(GUEST_SYSENTER_CS, data); 1818 break; 1819 case MSR_IA32_SYSENTER_EIP: 1820 vmcs_writel(GUEST_SYSENTER_EIP, data); 1821 break; 1822 case MSR_IA32_SYSENTER_ESP: 1823 vmcs_writel(GUEST_SYSENTER_ESP, data); 1824 break; 1825 case MSR_IA32_BNDCFGS: 1826 if (!kvm_mpx_supported() || 1827 
(!msr_info->host_initiated && 1828 !guest_cpuid_has(vcpu, X86_FEATURE_MPX))) 1829 return 1; 1830 if (is_noncanonical_address(data & PAGE_MASK, vcpu) || 1831 (data & MSR_IA32_BNDCFGS_RSVD)) 1832 return 1; 1833 vmcs_write64(GUEST_BNDCFGS, data); 1834 break; 1835 case MSR_IA32_SPEC_CTRL: 1836 if (!msr_info->host_initiated && 1837 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) 1838 return 1; 1839 1840 /* The STIBP bit doesn't fault even if it's not advertised */ 1841 if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) 1842 return 1; 1843 1844 vmx->spec_ctrl = data; 1845 1846 if (!data) 1847 break; 1848 1849 /* 1850 * For non-nested: 1851 * When it's written (to non-zero) for the first time, pass 1852 * it through. 1853 * 1854 * For nested: 1855 * The handling of the MSR bitmap for L2 guests is done in 1856 * nested_vmx_merge_msr_bitmap. We should not touch the 1857 * vmcs02.msr_bitmap here since it gets completely overwritten 1858 * in the merging. We update the vmcs01 here for L1 as well 1859 * since it will end up touching the MSR anyway now. 1860 */ 1861 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, 1862 MSR_IA32_SPEC_CTRL, 1863 MSR_TYPE_RW); 1864 break; 1865 case MSR_IA32_PRED_CMD: 1866 if (!msr_info->host_initiated && 1867 !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) 1868 return 1; 1869 1870 if (data & ~PRED_CMD_IBPB) 1871 return 1; 1872 1873 if (!data) 1874 break; 1875 1876 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); 1877 1878 /* 1879 * For non-nested: 1880 * When it's written (to non-zero) for the first time, pass 1881 * it through. 1882 * 1883 * For nested: 1884 * The handling of the MSR bitmap for L2 guests is done in 1885 * nested_vmx_merge_msr_bitmap. We should not touch the 1886 * vmcs02.msr_bitmap here since it gets completely overwritten 1887 * in the merging. 
1888 */ 1889 vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD, 1890 MSR_TYPE_W); 1891 break; 1892 case MSR_IA32_CR_PAT: 1893 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 1894 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) 1895 return 1; 1896 vmcs_write64(GUEST_IA32_PAT, data); 1897 vcpu->arch.pat = data; 1898 break; 1899 } 1900 ret = kvm_set_msr_common(vcpu, msr_info); 1901 break; 1902 case MSR_IA32_TSC_ADJUST: 1903 ret = kvm_set_msr_common(vcpu, msr_info); 1904 break; 1905 case MSR_IA32_MCG_EXT_CTL: 1906 if ((!msr_info->host_initiated && 1907 !(to_vmx(vcpu)->msr_ia32_feature_control & 1908 FEATURE_CONTROL_LMCE)) || 1909 (data & ~MCG_EXT_CTL_LMCE_EN)) 1910 return 1; 1911 vcpu->arch.mcg_ext_ctl = data; 1912 break; 1913 case MSR_IA32_FEATURE_CONTROL: 1914 if (!vmx_feature_control_msr_valid(vcpu, data) || 1915 (to_vmx(vcpu)->msr_ia32_feature_control & 1916 FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) 1917 return 1; 1918 vmx->msr_ia32_feature_control = data; 1919 if (msr_info->host_initiated && data == 0) 1920 vmx_leave_nested(vcpu); 1921 break; 1922 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 1923 if (!msr_info->host_initiated) 1924 return 1; /* they are read-only */ 1925 if (!nested_vmx_allowed(vcpu)) 1926 return 1; 1927 return vmx_set_vmx_msr(vcpu, msr_index, data); 1928 case MSR_IA32_XSS: 1929 if (!vmx_xsaves_supported()) 1930 return 1; 1931 /* 1932 * The only supported bit as of Skylake is bit 8, but 1933 * it is not supported on KVM. 
1934 */ 1935 if (data != 0) 1936 return 1; 1937 vcpu->arch.ia32_xss = data; 1938 if (vcpu->arch.ia32_xss != host_xss) 1939 add_atomic_switch_msr(vmx, MSR_IA32_XSS, 1940 vcpu->arch.ia32_xss, host_xss, false); 1941 else 1942 clear_atomic_switch_msr(vmx, MSR_IA32_XSS); 1943 break; 1944 case MSR_IA32_RTIT_CTL: 1945 if ((pt_mode != PT_MODE_HOST_GUEST) || 1946 vmx_rtit_ctl_check(vcpu, data) || 1947 vmx->nested.vmxon) 1948 return 1; 1949 vmcs_write64(GUEST_IA32_RTIT_CTL, data); 1950 vmx->pt_desc.guest.ctl = data; 1951 pt_update_intercept_for_msr(vmx); 1952 break; 1953 case MSR_IA32_RTIT_STATUS: 1954 if ((pt_mode != PT_MODE_HOST_GUEST) || 1955 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || 1956 (data & MSR_IA32_RTIT_STATUS_MASK)) 1957 return 1; 1958 vmx->pt_desc.guest.status = data; 1959 break; 1960 case MSR_IA32_RTIT_CR3_MATCH: 1961 if ((pt_mode != PT_MODE_HOST_GUEST) || 1962 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || 1963 !intel_pt_validate_cap(vmx->pt_desc.caps, 1964 PT_CAP_cr3_filtering)) 1965 return 1; 1966 vmx->pt_desc.guest.cr3_match = data; 1967 break; 1968 case MSR_IA32_RTIT_OUTPUT_BASE: 1969 if ((pt_mode != PT_MODE_HOST_GUEST) || 1970 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || 1971 (!intel_pt_validate_cap(vmx->pt_desc.caps, 1972 PT_CAP_topa_output) && 1973 !intel_pt_validate_cap(vmx->pt_desc.caps, 1974 PT_CAP_single_range_output)) || 1975 (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)) 1976 return 1; 1977 vmx->pt_desc.guest.output_base = data; 1978 break; 1979 case MSR_IA32_RTIT_OUTPUT_MASK: 1980 if ((pt_mode != PT_MODE_HOST_GUEST) || 1981 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || 1982 (!intel_pt_validate_cap(vmx->pt_desc.caps, 1983 PT_CAP_topa_output) && 1984 !intel_pt_validate_cap(vmx->pt_desc.caps, 1985 PT_CAP_single_range_output))) 1986 return 1; 1987 vmx->pt_desc.guest.output_mask = data; 1988 break; 1989 case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: 1990 index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; 1991 if ((pt_mode != PT_MODE_HOST_GUEST) || 1992 (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) || 1993 (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, 1994 PT_CAP_num_address_ranges))) 1995 return 1; 1996 if (index % 2) 1997 vmx->pt_desc.guest.addr_b[index / 2] = data; 1998 else 1999 vmx->pt_desc.guest.addr_a[index / 2] = data; 2000 break; 2001 case MSR_TSC_AUX: 2002 if (!msr_info->host_initiated && 2003 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) 2004 return 1; 2005 /* Check reserved bit, higher 32 bits should be zero */ 2006 if ((data >> 32) != 0) 2007 return 1; 2008 /* Else, falls through */ 2009 default: 2010 msr = find_msr_entry(vmx, msr_index); 2011 if (msr) { 2012 u64 old_msr_data = msr->data; 2013 msr->data = data; 2014 if (msr - vmx->guest_msrs < vmx->save_nmsrs) { 2015 preempt_disable(); 2016 ret = kvm_set_shared_msr(msr->index, msr->data, 2017 msr->mask); 2018 preempt_enable(); 2019 if (ret) 2020 msr->data = old_msr_data; 2021 } 2022 break; 2023 } 2024 ret = kvm_set_msr_common(vcpu, msr_info); 2025 } 2026 2027 return ret; 2028 } 2029 2030 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2031 { 2032 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); 2033 switch (reg) { 2034 case VCPU_REGS_RSP: 2035 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2036 break; 2037 case VCPU_REGS_RIP: 2038 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2039 break; 2040 case VCPU_EXREG_PDPTR: 2041 if (enable_ept) 2042 ept_save_pdptrs(vcpu); 2043 break; 2044 default: 2045 break; 2046 } 2047 } 2048 2049 static __init int cpu_has_kvm_support(void) 2050 { 2051 return cpu_has_vmx(); 2052 } 2053 2054 static __init int vmx_disabled_by_bios(void) 2055 { 2056 u64 msr; 2057 2058 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 2059 if (msr & FEATURE_CONTROL_LOCKED) { 2060 /* launched w/ TXT and VMX disabled */ 2061 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 2062 
&& tboot_enabled()) 2063 return 1; 2064 /* launched w/o TXT and VMX only enabled w/ TXT */ 2065 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 2066 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 2067 && !tboot_enabled()) { 2068 printk(KERN_WARNING "kvm: disable TXT in the BIOS or " 2069 "activate TXT before enabling KVM\n"); 2070 return 1; 2071 } 2072 /* launched w/o TXT and VMX disabled */ 2073 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 2074 && !tboot_enabled()) 2075 return 1; 2076 } 2077 2078 return 0; 2079 } 2080 2081 static void kvm_cpu_vmxon(u64 addr) 2082 { 2083 cr4_set_bits(X86_CR4_VMXE); 2084 intel_pt_handle_vmx(1); 2085 2086 asm volatile ("vmxon %0" : : "m"(addr)); 2087 } 2088 2089 static int hardware_enable(void) 2090 { 2091 int cpu = raw_smp_processor_id(); 2092 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2093 u64 old, test_bits; 2094 2095 if (cr4_read_shadow() & X86_CR4_VMXE) 2096 return -EBUSY; 2097 2098 /* 2099 * This can happen if we hot-added a CPU but failed to allocate 2100 * VP assist page for it. 2101 */ 2102 if (static_branch_unlikely(&enable_evmcs) && 2103 !hv_get_vp_assist_page(cpu)) 2104 return -EFAULT; 2105 2106 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 2107 INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); 2108 spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 2109 2110 /* 2111 * Now we can enable the vmclear operation in kdump 2112 * since the loaded_vmcss_on_cpu list on this cpu 2113 * has been initialized. 2114 * 2115 * Though the cpu is not in VMX operation now, there 2116 * is no problem to enable the vmclear operation 2117 * for the loaded_vmcss_on_cpu list is empty! 
2118 */ 2119 crash_enable_local_vmclear(cpu); 2120 2121 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2122 2123 test_bits = FEATURE_CONTROL_LOCKED; 2124 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 2125 if (tboot_enabled()) 2126 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; 2127 2128 if ((old & test_bits) != test_bits) { 2129 /* enable and lock */ 2130 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); 2131 } 2132 kvm_cpu_vmxon(phys_addr); 2133 if (enable_ept) 2134 ept_sync_global(); 2135 2136 return 0; 2137 } 2138 2139 static void vmclear_local_loaded_vmcss(void) 2140 { 2141 int cpu = raw_smp_processor_id(); 2142 struct loaded_vmcs *v, *n; 2143 2144 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2145 loaded_vmcss_on_cpu_link) 2146 __loaded_vmcs_clear(v); 2147 } 2148 2149 2150 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() 2151 * tricks. 2152 */ 2153 static void kvm_cpu_vmxoff(void) 2154 { 2155 asm volatile (__ex("vmxoff")); 2156 2157 intel_pt_handle_vmx(0); 2158 cr4_clear_bits(X86_CR4_VMXE); 2159 } 2160 2161 static void hardware_disable(void) 2162 { 2163 vmclear_local_loaded_vmcss(); 2164 kvm_cpu_vmxoff(); 2165 } 2166 2167 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 2168 u32 msr, u32 *result) 2169 { 2170 u32 vmx_msr_low, vmx_msr_high; 2171 u32 ctl = ctl_min | ctl_opt; 2172 2173 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2174 2175 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2176 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2177 2178 /* Ensure minimum (required) set of control bits are supported. 
*/ 2179 if (ctl_min & ~ctl) 2180 return -EIO; 2181 2182 *result = ctl; 2183 return 0; 2184 } 2185 2186 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf, 2187 struct vmx_capability *vmx_cap) 2188 { 2189 u32 vmx_msr_low, vmx_msr_high; 2190 u32 min, opt, min2, opt2; 2191 u32 _pin_based_exec_control = 0; 2192 u32 _cpu_based_exec_control = 0; 2193 u32 _cpu_based_2nd_exec_control = 0; 2194 u32 _vmexit_control = 0; 2195 u32 _vmentry_control = 0; 2196 2197 memset(vmcs_conf, 0, sizeof(*vmcs_conf)); 2198 min = CPU_BASED_HLT_EXITING | 2199 #ifdef CONFIG_X86_64 2200 CPU_BASED_CR8_LOAD_EXITING | 2201 CPU_BASED_CR8_STORE_EXITING | 2202 #endif 2203 CPU_BASED_CR3_LOAD_EXITING | 2204 CPU_BASED_CR3_STORE_EXITING | 2205 CPU_BASED_UNCOND_IO_EXITING | 2206 CPU_BASED_MOV_DR_EXITING | 2207 CPU_BASED_USE_TSC_OFFSETING | 2208 CPU_BASED_MWAIT_EXITING | 2209 CPU_BASED_MONITOR_EXITING | 2210 CPU_BASED_INVLPG_EXITING | 2211 CPU_BASED_RDPMC_EXITING; 2212 2213 opt = CPU_BASED_TPR_SHADOW | 2214 CPU_BASED_USE_MSR_BITMAPS | 2215 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2216 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 2217 &_cpu_based_exec_control) < 0) 2218 return -EIO; 2219 #ifdef CONFIG_X86_64 2220 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2221 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & 2222 ~CPU_BASED_CR8_STORE_EXITING; 2223 #endif 2224 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 2225 min2 = 0; 2226 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2227 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2228 SECONDARY_EXEC_WBINVD_EXITING | 2229 SECONDARY_EXEC_ENABLE_VPID | 2230 SECONDARY_EXEC_ENABLE_EPT | 2231 SECONDARY_EXEC_UNRESTRICTED_GUEST | 2232 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 2233 SECONDARY_EXEC_DESC | 2234 SECONDARY_EXEC_RDTSCP | 2235 SECONDARY_EXEC_ENABLE_INVPCID | 2236 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2237 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2238 SECONDARY_EXEC_SHADOW_VMCS | 2239 SECONDARY_EXEC_XSAVES | 
2240 SECONDARY_EXEC_RDSEED_EXITING | 2241 SECONDARY_EXEC_RDRAND_EXITING | 2242 SECONDARY_EXEC_ENABLE_PML | 2243 SECONDARY_EXEC_TSC_SCALING | 2244 SECONDARY_EXEC_PT_USE_GPA | 2245 SECONDARY_EXEC_PT_CONCEAL_VMX | 2246 SECONDARY_EXEC_ENABLE_VMFUNC | 2247 SECONDARY_EXEC_ENCLS_EXITING; 2248 if (adjust_vmx_controls(min2, opt2, 2249 MSR_IA32_VMX_PROCBASED_CTLS2, 2250 &_cpu_based_2nd_exec_control) < 0) 2251 return -EIO; 2252 } 2253 #ifndef CONFIG_X86_64 2254 if (!(_cpu_based_2nd_exec_control & 2255 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 2256 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 2257 #endif 2258 2259 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 2260 _cpu_based_2nd_exec_control &= ~( 2261 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2262 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2263 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 2264 2265 rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, 2266 &vmx_cap->ept, &vmx_cap->vpid); 2267 2268 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 2269 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT 2270 enabled */ 2271 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 2272 CPU_BASED_CR3_STORE_EXITING | 2273 CPU_BASED_INVLPG_EXITING); 2274 } else if (vmx_cap->ept) { 2275 vmx_cap->ept = 0; 2276 pr_warn_once("EPT CAP should not exist if not support " 2277 "1-setting enable EPT VM-execution control\n"); 2278 } 2279 if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) && 2280 vmx_cap->vpid) { 2281 vmx_cap->vpid = 0; 2282 pr_warn_once("VPID CAP should not exist if not support " 2283 "1-setting enable VPID VM-execution control\n"); 2284 } 2285 2286 min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; 2287 #ifdef CONFIG_X86_64 2288 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 2289 #endif 2290 opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | 2291 VM_EXIT_SAVE_IA32_PAT | 2292 VM_EXIT_LOAD_IA32_PAT | 2293 VM_EXIT_LOAD_IA32_EFER | 2294 VM_EXIT_CLEAR_BNDCFGS | 2295 VM_EXIT_PT_CONCEAL_PIP | 2296 
VM_EXIT_CLEAR_IA32_RTIT_CTL; 2297 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, 2298 &_vmexit_control) < 0) 2299 return -EIO; 2300 2301 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 2302 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | 2303 PIN_BASED_VMX_PREEMPTION_TIMER; 2304 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 2305 &_pin_based_exec_control) < 0) 2306 return -EIO; 2307 2308 if (cpu_has_broken_vmx_preemption_timer()) 2309 _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 2310 if (!(_cpu_based_2nd_exec_control & 2311 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) 2312 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 2313 2314 min = VM_ENTRY_LOAD_DEBUG_CONTROLS; 2315 opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | 2316 VM_ENTRY_LOAD_IA32_PAT | 2317 VM_ENTRY_LOAD_IA32_EFER | 2318 VM_ENTRY_LOAD_BNDCFGS | 2319 VM_ENTRY_PT_CONCEAL_PIP | 2320 VM_ENTRY_LOAD_IA32_RTIT_CTL; 2321 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, 2322 &_vmentry_control) < 0) 2323 return -EIO; 2324 2325 /* 2326 * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they 2327 * can't be used due to an errata where VM Exit may incorrectly clear 2328 * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the 2329 * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL. 2330 */ 2331 if (boot_cpu_data.x86 == 0x6) { 2332 switch (boot_cpu_data.x86_model) { 2333 case 26: /* AAK155 */ 2334 case 30: /* AAP115 */ 2335 case 37: /* AAT100 */ 2336 case 44: /* BC86,AAY89,BD102 */ 2337 case 46: /* BA97 */ 2338 _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; 2339 _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 2340 pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 2341 "does not work properly. Using workaround\n"); 2342 break; 2343 default: 2344 break; 2345 } 2346 } 2347 2348 2349 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); 2350 2351 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. 
*/ 2352 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) 2353 return -EIO; 2354 2355 #ifdef CONFIG_X86_64 2356 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ 2357 if (vmx_msr_high & (1u<<16)) 2358 return -EIO; 2359 #endif 2360 2361 /* Require Write-Back (WB) memory type for VMCS accesses. */ 2362 if (((vmx_msr_high >> 18) & 15) != 6) 2363 return -EIO; 2364 2365 vmcs_conf->size = vmx_msr_high & 0x1fff; 2366 vmcs_conf->order = get_order(vmcs_conf->size); 2367 vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; 2368 2369 vmcs_conf->revision_id = vmx_msr_low; 2370 2371 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 2372 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 2373 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 2374 vmcs_conf->vmexit_ctrl = _vmexit_control; 2375 vmcs_conf->vmentry_ctrl = _vmentry_control; 2376 2377 if (static_branch_unlikely(&enable_evmcs)) 2378 evmcs_sanitize_exec_ctrls(vmcs_conf); 2379 2380 return 0; 2381 } 2382 2383 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags) 2384 { 2385 int node = cpu_to_node(cpu); 2386 struct page *pages; 2387 struct vmcs *vmcs; 2388 2389 pages = __alloc_pages_node(node, flags, vmcs_config.order); 2390 if (!pages) 2391 return NULL; 2392 vmcs = page_address(pages); 2393 memset(vmcs, 0, vmcs_config.size); 2394 2395 /* KVM supports Enlightened VMCS v1 only */ 2396 if (static_branch_unlikely(&enable_evmcs)) 2397 vmcs->hdr.revision_id = KVM_EVMCS_VERSION; 2398 else 2399 vmcs->hdr.revision_id = vmcs_config.revision_id; 2400 2401 if (shadow) 2402 vmcs->hdr.shadow_vmcs = 1; 2403 return vmcs; 2404 } 2405 2406 void free_vmcs(struct vmcs *vmcs) 2407 { 2408 free_pages((unsigned long)vmcs, vmcs_config.order); 2409 } 2410 2411 /* 2412 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 2413 */ 2414 void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2415 { 2416 if (!loaded_vmcs->vmcs) 2417 return; 2418 loaded_vmcs_clear(loaded_vmcs); 2419 
free_vmcs(loaded_vmcs->vmcs); 2420 loaded_vmcs->vmcs = NULL; 2421 if (loaded_vmcs->msr_bitmap) 2422 free_page((unsigned long)loaded_vmcs->msr_bitmap); 2423 WARN_ON(loaded_vmcs->shadow_vmcs != NULL); 2424 } 2425 2426 int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 2427 { 2428 loaded_vmcs->vmcs = alloc_vmcs(false); 2429 if (!loaded_vmcs->vmcs) 2430 return -ENOMEM; 2431 2432 loaded_vmcs->shadow_vmcs = NULL; 2433 loaded_vmcs_init(loaded_vmcs); 2434 2435 if (cpu_has_vmx_msr_bitmap()) { 2436 loaded_vmcs->msr_bitmap = (unsigned long *) 2437 __get_free_page(GFP_KERNEL_ACCOUNT); 2438 if (!loaded_vmcs->msr_bitmap) 2439 goto out_vmcs; 2440 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE); 2441 2442 if (IS_ENABLED(CONFIG_HYPERV) && 2443 static_branch_unlikely(&enable_evmcs) && 2444 (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) { 2445 struct hv_enlightened_vmcs *evmcs = 2446 (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs; 2447 2448 evmcs->hv_enlightenments_control.msr_bitmap = 1; 2449 } 2450 } 2451 2452 memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); 2453 2454 return 0; 2455 2456 out_vmcs: 2457 free_loaded_vmcs(loaded_vmcs); 2458 return -ENOMEM; 2459 } 2460 2461 static void free_kvm_area(void) 2462 { 2463 int cpu; 2464 2465 for_each_possible_cpu(cpu) { 2466 free_vmcs(per_cpu(vmxarea, cpu)); 2467 per_cpu(vmxarea, cpu) = NULL; 2468 } 2469 } 2470 2471 static __init int alloc_kvm_area(void) 2472 { 2473 int cpu; 2474 2475 for_each_possible_cpu(cpu) { 2476 struct vmcs *vmcs; 2477 2478 vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL); 2479 if (!vmcs) { 2480 free_kvm_area(); 2481 return -ENOMEM; 2482 } 2483 2484 /* 2485 * When eVMCS is enabled, alloc_vmcs_cpu() sets 2486 * vmcs->revision_id to KVM_EVMCS_VERSION instead of 2487 * revision_id reported by MSR_IA32_VMX_BASIC. 
2488 * 2489 * However, even though not explicitly documented by 2490 * TLFS, VMXArea passed as VMXON argument should 2491 * still be marked with revision_id reported by 2492 * physical CPU. 2493 */ 2494 if (static_branch_unlikely(&enable_evmcs)) 2495 vmcs->hdr.revision_id = vmcs_config.revision_id; 2496 2497 per_cpu(vmxarea, cpu) = vmcs; 2498 } 2499 return 0; 2500 } 2501 2502 static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, 2503 struct kvm_segment *save) 2504 { 2505 if (!emulate_invalid_guest_state) { 2506 /* 2507 * CS and SS RPL should be equal during guest entry according 2508 * to VMX spec, but in reality it is not always so. Since vcpu 2509 * is in the middle of the transition from real mode to 2510 * protected mode it is safe to assume that RPL 0 is a good 2511 * default value. 2512 */ 2513 if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) 2514 save->selector &= ~SEGMENT_RPL_MASK; 2515 save->dpl = save->selector & SEGMENT_RPL_MASK; 2516 save->s = 1; 2517 } 2518 vmx_set_segment(vcpu, save, seg); 2519 } 2520 2521 static void enter_pmode(struct kvm_vcpu *vcpu) 2522 { 2523 unsigned long flags; 2524 struct vcpu_vmx *vmx = to_vmx(vcpu); 2525 2526 /* 2527 * Update real mode segment cache. It may be not up-to-date if sement 2528 * register was written while vcpu was in a guest mode. 
2529 */ 2530 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 2531 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 2532 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 2533 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 2534 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 2535 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 2536 2537 vmx->rmode.vm86_active = 0; 2538 2539 vmx_segment_cache_clear(vmx); 2540 2541 vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 2542 2543 flags = vmcs_readl(GUEST_RFLAGS); 2544 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 2545 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 2546 vmcs_writel(GUEST_RFLAGS, flags); 2547 2548 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 2549 (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME)); 2550 2551 update_exception_bitmap(vcpu); 2552 2553 fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 2554 fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 2555 fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 2556 fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 2557 fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 2558 fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 2559 } 2560 2561 static void fix_rmode_seg(int seg, struct kvm_segment *save) 2562 { 2563 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2564 struct kvm_segment var = *save; 2565 2566 var.dpl = 0x3; 2567 if (seg == VCPU_SREG_CS) 2568 var.type = 0x3; 2569 2570 if (!emulate_invalid_guest_state) { 2571 var.selector = var.base >> 4; 2572 var.base = var.base & 0xffff0; 2573 var.limit = 0xffff; 2574 var.g = 0; 2575 var.db = 0; 2576 var.present = 1; 2577 var.s = 1; 2578 var.l = 0; 2579 var.unusable = 0; 2580 var.type = 0x3; 2581 var.avl = 0; 2582 if (save->base & 
0xf) 2583 printk_once(KERN_WARNING "kvm: segment base is not " 2584 "paragraph aligned when entering " 2585 "protected mode (seg=%d)", seg); 2586 } 2587 2588 vmcs_write16(sf->selector, var.selector); 2589 vmcs_writel(sf->base, var.base); 2590 vmcs_write32(sf->limit, var.limit); 2591 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 2592 } 2593 2594 static void enter_rmode(struct kvm_vcpu *vcpu) 2595 { 2596 unsigned long flags; 2597 struct vcpu_vmx *vmx = to_vmx(vcpu); 2598 struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm); 2599 2600 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 2601 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 2602 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 2603 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 2604 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 2605 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 2606 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 2607 2608 vmx->rmode.vm86_active = 1; 2609 2610 /* 2611 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 2612 * vcpu. Warn the user that an update is overdue. 
2613 */ 2614 if (!kvm_vmx->tss_addr) 2615 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " 2616 "called before entering vcpu\n"); 2617 2618 vmx_segment_cache_clear(vmx); 2619 2620 vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr); 2621 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 2622 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 2623 2624 flags = vmcs_readl(GUEST_RFLAGS); 2625 vmx->rmode.save_rflags = flags; 2626 2627 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 2628 2629 vmcs_writel(GUEST_RFLAGS, flags); 2630 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 2631 update_exception_bitmap(vcpu); 2632 2633 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 2634 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 2635 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 2636 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 2637 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 2638 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 2639 2640 kvm_mmu_reset_context(vcpu); 2641 } 2642 2643 void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 2644 { 2645 struct vcpu_vmx *vmx = to_vmx(vcpu); 2646 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 2647 2648 if (!msr) 2649 return; 2650 2651 vcpu->arch.efer = efer; 2652 if (efer & EFER_LMA) { 2653 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 2654 msr->data = efer; 2655 } else { 2656 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 2657 2658 msr->data = efer & ~EFER_LME; 2659 } 2660 setup_msrs(vmx); 2661 } 2662 2663 #ifdef CONFIG_X86_64 2664 2665 static void enter_lmode(struct kvm_vcpu *vcpu) 2666 { 2667 u32 guest_tr_ar; 2668 2669 vmx_segment_cache_clear(to_vmx(vcpu)); 2670 2671 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 2672 if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) { 2673 pr_debug_ratelimited("%s: tss fixup for long mode. 
\n", 2674 __func__); 2675 vmcs_write32(GUEST_TR_AR_BYTES, 2676 (guest_tr_ar & ~VMX_AR_TYPE_MASK) 2677 | VMX_AR_TYPE_BUSY_64_TSS); 2678 } 2679 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 2680 } 2681 2682 static void exit_lmode(struct kvm_vcpu *vcpu) 2683 { 2684 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 2685 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 2686 } 2687 2688 #endif 2689 2690 static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) 2691 { 2692 int vpid = to_vmx(vcpu)->vpid; 2693 2694 if (!vpid_sync_vcpu_addr(vpid, addr)) 2695 vpid_sync_context(vpid); 2696 2697 /* 2698 * If VPIDs are not supported or enabled, then the above is a no-op. 2699 * But we don't really need a TLB flush in that case anyway, because 2700 * each VM entry/exit includes an implicit flush when VPID is 0. 2701 */ 2702 } 2703 2704 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 2705 { 2706 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 2707 2708 vcpu->arch.cr0 &= ~cr0_guest_owned_bits; 2709 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; 2710 } 2711 2712 static void vmx_decache_cr3(struct kvm_vcpu *vcpu) 2713 { 2714 if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) 2715 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 2716 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 2717 } 2718 2719 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 2720 { 2721 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2722 2723 vcpu->arch.cr4 &= ~cr4_guest_owned_bits; 2724 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; 2725 } 2726 2727 static void ept_load_pdptrs(struct kvm_vcpu *vcpu) 2728 { 2729 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 2730 2731 if (!test_bit(VCPU_EXREG_PDPTR, 2732 (unsigned long *)&vcpu->arch.regs_dirty)) 2733 return; 2734 2735 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 2736 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 2737 
vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 2738 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 2739 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 2740 } 2741 } 2742 2743 void ept_save_pdptrs(struct kvm_vcpu *vcpu) 2744 { 2745 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 2746 2747 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 2748 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 2749 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 2750 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 2751 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 2752 } 2753 2754 __set_bit(VCPU_EXREG_PDPTR, 2755 (unsigned long *)&vcpu->arch.regs_avail); 2756 __set_bit(VCPU_EXREG_PDPTR, 2757 (unsigned long *)&vcpu->arch.regs_dirty); 2758 } 2759 2760 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, 2761 unsigned long cr0, 2762 struct kvm_vcpu *vcpu) 2763 { 2764 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) 2765 vmx_decache_cr3(vcpu); 2766 if (!(cr0 & X86_CR0_PG)) { 2767 /* From paging/starting to nonpaging */ 2768 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 2769 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | 2770 (CPU_BASED_CR3_LOAD_EXITING | 2771 CPU_BASED_CR3_STORE_EXITING)); 2772 vcpu->arch.cr0 = cr0; 2773 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 2774 } else if (!is_paging(vcpu)) { 2775 /* From nonpaging to paging */ 2776 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 2777 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & 2778 ~(CPU_BASED_CR3_LOAD_EXITING | 2779 CPU_BASED_CR3_STORE_EXITING)); 2780 vcpu->arch.cr0 = cr0; 2781 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 2782 } 2783 2784 if (!(cr0 & X86_CR0_WP)) 2785 *hw_cr0 &= ~X86_CR0_WP; 2786 } 2787 2788 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 2789 { 2790 struct vcpu_vmx *vmx = to_vmx(vcpu); 2791 unsigned long hw_cr0; 2792 2793 hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); 2794 if (enable_unrestricted_guest) 2795 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 2796 else { 2797 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 2798 2799 if 
(vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 2800 enter_pmode(vcpu); 2801 2802 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 2803 enter_rmode(vcpu); 2804 } 2805 2806 #ifdef CONFIG_X86_64 2807 if (vcpu->arch.efer & EFER_LME) { 2808 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 2809 enter_lmode(vcpu); 2810 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 2811 exit_lmode(vcpu); 2812 } 2813 #endif 2814 2815 if (enable_ept && !enable_unrestricted_guest) 2816 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 2817 2818 vmcs_writel(CR0_READ_SHADOW, cr0); 2819 vmcs_writel(GUEST_CR0, hw_cr0); 2820 vcpu->arch.cr0 = cr0; 2821 2822 /* depends on vcpu->arch.cr0 to be set to a new value */ 2823 vmx->emulation_required = emulation_required(vcpu); 2824 } 2825 2826 static int get_ept_level(struct kvm_vcpu *vcpu) 2827 { 2828 if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) 2829 return 5; 2830 return 4; 2831 } 2832 2833 u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) 2834 { 2835 u64 eptp = VMX_EPTP_MT_WB; 2836 2837 eptp |= (get_ept_level(vcpu) == 5) ? 
VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; 2838 2839 if (enable_ept_ad_bits && 2840 (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) 2841 eptp |= VMX_EPTP_AD_ENABLE_BIT; 2842 eptp |= (root_hpa & PAGE_MASK); 2843 2844 return eptp; 2845 } 2846 2847 void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 2848 { 2849 struct kvm *kvm = vcpu->kvm; 2850 unsigned long guest_cr3; 2851 u64 eptp; 2852 2853 guest_cr3 = cr3; 2854 if (enable_ept) { 2855 eptp = construct_eptp(vcpu, cr3); 2856 vmcs_write64(EPT_POINTER, eptp); 2857 2858 if (kvm_x86_ops->tlb_remote_flush) { 2859 spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); 2860 to_vmx(vcpu)->ept_pointer = eptp; 2861 to_kvm_vmx(kvm)->ept_pointers_match 2862 = EPT_POINTERS_CHECK; 2863 spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); 2864 } 2865 2866 if (enable_unrestricted_guest || is_paging(vcpu) || 2867 is_guest_mode(vcpu)) 2868 guest_cr3 = kvm_read_cr3(vcpu); 2869 else 2870 guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; 2871 ept_load_pdptrs(vcpu); 2872 } 2873 2874 vmcs_writel(GUEST_CR3, guest_cr3); 2875 } 2876 2877 int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 2878 { 2879 /* 2880 * Pass through host's Machine Check Enable value to hw_cr4, which 2881 * is in force while we are in guest mode. Do not let guests control 2882 * this bit, even if host CR4.MCE == 0. 
2883 */ 2884 unsigned long hw_cr4; 2885 2886 hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); 2887 if (enable_unrestricted_guest) 2888 hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; 2889 else if (to_vmx(vcpu)->rmode.vm86_active) 2890 hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON; 2891 else 2892 hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; 2893 2894 if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { 2895 if (cr4 & X86_CR4_UMIP) { 2896 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, 2897 SECONDARY_EXEC_DESC); 2898 hw_cr4 &= ~X86_CR4_UMIP; 2899 } else if (!is_guest_mode(vcpu) || 2900 !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) 2901 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, 2902 SECONDARY_EXEC_DESC); 2903 } 2904 2905 if (cr4 & X86_CR4_VMXE) { 2906 /* 2907 * To use VMXON (and later other VMX instructions), a guest 2908 * must first be able to turn on cr4.VMXE (see handle_vmon()). 2909 * So basically the check on whether to allow nested VMX 2910 * is here. We operate under the default treatment of SMM, 2911 * so VMX cannot be enabled under SMM. 2912 */ 2913 if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) 2914 return 1; 2915 } 2916 2917 if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) 2918 return 1; 2919 2920 vcpu->arch.cr4 = cr4; 2921 2922 if (!enable_unrestricted_guest) { 2923 if (enable_ept) { 2924 if (!is_paging(vcpu)) { 2925 hw_cr4 &= ~X86_CR4_PAE; 2926 hw_cr4 |= X86_CR4_PSE; 2927 } else if (!(cr4 & X86_CR4_PAE)) { 2928 hw_cr4 &= ~X86_CR4_PAE; 2929 } 2930 } 2931 2932 /* 2933 * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in 2934 * hardware. To emulate this behavior, SMEP/SMAP/PKU needs 2935 * to be manually disabled when guest switches to non-paging 2936 * mode. 2937 * 2938 * If !enable_unrestricted_guest, the CPU is always running 2939 * with CR0.PG=1 and CR4 needs to be modified. 2940 * If enable_unrestricted_guest, the CPU automatically 2941 * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. 
2942 */ 2943 if (!is_paging(vcpu)) 2944 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); 2945 } 2946 2947 vmcs_writel(CR4_READ_SHADOW, cr4); 2948 vmcs_writel(GUEST_CR4, hw_cr4); 2949 return 0; 2950 } 2951 2952 void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 2953 { 2954 struct vcpu_vmx *vmx = to_vmx(vcpu); 2955 u32 ar; 2956 2957 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 2958 *var = vmx->rmode.segs[seg]; 2959 if (seg == VCPU_SREG_TR 2960 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 2961 return; 2962 var->base = vmx_read_guest_seg_base(vmx, seg); 2963 var->selector = vmx_read_guest_seg_selector(vmx, seg); 2964 return; 2965 } 2966 var->base = vmx_read_guest_seg_base(vmx, seg); 2967 var->limit = vmx_read_guest_seg_limit(vmx, seg); 2968 var->selector = vmx_read_guest_seg_selector(vmx, seg); 2969 ar = vmx_read_guest_seg_ar(vmx, seg); 2970 var->unusable = (ar >> 16) & 1; 2971 var->type = ar & 15; 2972 var->s = (ar >> 4) & 1; 2973 var->dpl = (ar >> 5) & 3; 2974 /* 2975 * Some userspaces do not preserve unusable property. Since usable 2976 * segment has to be present according to VMX spec we can use present 2977 * property to amend userspace bug by making unusable segment always 2978 * nonpresent. vmx_segment_access_rights() already marks nonpresent 2979 * segment as unusable. 
2980 */ 2981 var->present = !var->unusable; 2982 var->avl = (ar >> 12) & 1; 2983 var->l = (ar >> 13) & 1; 2984 var->db = (ar >> 14) & 1; 2985 var->g = (ar >> 15) & 1; 2986 } 2987 2988 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 2989 { 2990 struct kvm_segment s; 2991 2992 if (to_vmx(vcpu)->rmode.vm86_active) { 2993 vmx_get_segment(vcpu, &s, seg); 2994 return s.base; 2995 } 2996 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 2997 } 2998 2999 int vmx_get_cpl(struct kvm_vcpu *vcpu) 3000 { 3001 struct vcpu_vmx *vmx = to_vmx(vcpu); 3002 3003 if (unlikely(vmx->rmode.vm86_active)) 3004 return 0; 3005 else { 3006 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3007 return VMX_AR_DPL(ar); 3008 } 3009 } 3010 3011 static u32 vmx_segment_access_rights(struct kvm_segment *var) 3012 { 3013 u32 ar; 3014 3015 if (var->unusable || !var->present) 3016 ar = 1 << 16; 3017 else { 3018 ar = var->type & 15; 3019 ar |= (var->s & 1) << 4; 3020 ar |= (var->dpl & 3) << 5; 3021 ar |= (var->present & 1) << 7; 3022 ar |= (var->avl & 1) << 12; 3023 ar |= (var->l & 1) << 13; 3024 ar |= (var->db & 1) << 14; 3025 ar |= (var->g & 1) << 15; 3026 } 3027 3028 return ar; 3029 } 3030 3031 void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) 3032 { 3033 struct vcpu_vmx *vmx = to_vmx(vcpu); 3034 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3035 3036 vmx_segment_cache_clear(vmx); 3037 3038 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3039 vmx->rmode.segs[seg] = *var; 3040 if (seg == VCPU_SREG_TR) 3041 vmcs_write16(sf->selector, var->selector); 3042 else if (var->s) 3043 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3044 goto out; 3045 } 3046 3047 vmcs_writel(sf->base, var->base); 3048 vmcs_write32(sf->limit, var->limit); 3049 vmcs_write16(sf->selector, var->selector); 3050 3051 /* 3052 * Fix the "Accessed" bit in AR field of segment registers for older 3053 * qemu binaries. 
3054 * IA32 arch specifies that at the time of processor reset the 3055 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3056 * is setting it to 0 in the userland code. This causes invalid guest 3057 * state vmexit when "unrestricted guest" mode is turned on. 3058 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3059 * tree. Newer qemu binaries with that qemu fix would not need this 3060 * kvm hack. 3061 */ 3062 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) 3063 var->type |= 0x1; /* Accessed */ 3064 3065 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3066 3067 out: 3068 vmx->emulation_required = emulation_required(vcpu); 3069 } 3070 3071 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3072 { 3073 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3074 3075 *db = (ar >> 14) & 1; 3076 *l = (ar >> 13) & 1; 3077 } 3078 3079 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3080 { 3081 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3082 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3083 } 3084 3085 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3086 { 3087 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3088 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3089 } 3090 3091 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3092 { 3093 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3094 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3095 } 3096 3097 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3098 { 3099 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3100 vmcs_writel(GUEST_GDTR_BASE, dt->address); 3101 } 3102 3103 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3104 { 3105 struct kvm_segment var; 3106 u32 ar; 3107 3108 vmx_get_segment(vcpu, &var, seg); 3109 var.dpl = 0x3; 3110 if (seg == VCPU_SREG_CS) 3111 var.type = 0x3; 3112 ar = vmx_segment_access_rights(&var); 3113 3114 if (var.base != (var.selector << 4)) 
3115 return false; 3116 if (var.limit != 0xffff) 3117 return false; 3118 if (ar != 0xf3) 3119 return false; 3120 3121 return true; 3122 } 3123 3124 static bool code_segment_valid(struct kvm_vcpu *vcpu) 3125 { 3126 struct kvm_segment cs; 3127 unsigned int cs_rpl; 3128 3129 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3130 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3131 3132 if (cs.unusable) 3133 return false; 3134 if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK)) 3135 return false; 3136 if (!cs.s) 3137 return false; 3138 if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) { 3139 if (cs.dpl > cs_rpl) 3140 return false; 3141 } else { 3142 if (cs.dpl != cs_rpl) 3143 return false; 3144 } 3145 if (!cs.present) 3146 return false; 3147 3148 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3149 return true; 3150 } 3151 3152 static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3153 { 3154 struct kvm_segment ss; 3155 unsigned int ss_rpl; 3156 3157 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3158 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3159 3160 if (ss.unusable) 3161 return true; 3162 if (ss.type != 3 && ss.type != 7) 3163 return false; 3164 if (!ss.s) 3165 return false; 3166 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3167 return false; 3168 if (!ss.present) 3169 return false; 3170 3171 return true; 3172 } 3173 3174 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3175 { 3176 struct kvm_segment var; 3177 unsigned int rpl; 3178 3179 vmx_get_segment(vcpu, &var, seg); 3180 rpl = var.selector & SEGMENT_RPL_MASK; 3181 3182 if (var.unusable) 3183 return true; 3184 if (!var.s) 3185 return false; 3186 if (!var.present) 3187 return false; 3188 if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) { 3189 if (var.dpl < rpl) /* DPL < RPL */ 3190 return false; 3191 } 3192 3193 /* TODO: Add other members to kvm_segment_field to allow checking for other access 3194 * rights flags 3195 */ 3196 return true; 3197 } 
3198 3199 static bool tr_valid(struct kvm_vcpu *vcpu) 3200 { 3201 struct kvm_segment tr; 3202 3203 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3204 3205 if (tr.unusable) 3206 return false; 3207 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3208 return false; 3209 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3210 return false; 3211 if (!tr.present) 3212 return false; 3213 3214 return true; 3215 } 3216 3217 static bool ldtr_valid(struct kvm_vcpu *vcpu) 3218 { 3219 struct kvm_segment ldtr; 3220 3221 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3222 3223 if (ldtr.unusable) 3224 return true; 3225 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3226 return false; 3227 if (ldtr.type != 2) 3228 return false; 3229 if (!ldtr.present) 3230 return false; 3231 3232 return true; 3233 } 3234 3235 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3236 { 3237 struct kvm_segment cs, ss; 3238 3239 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3240 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3241 3242 return ((cs.selector & SEGMENT_RPL_MASK) == 3243 (ss.selector & SEGMENT_RPL_MASK)); 3244 } 3245 3246 /* 3247 * Check if guest state is valid. Returns true if valid, false if 3248 * not. 
 * We assume that registers are always usable
 */
static bool guest_state_valid(struct kvm_vcpu *vcpu)
{
	/* No software checks are done when unrestricted guest is enabled. */
	if (enable_unrestricted_guest)
		return true;

	/* real mode guest state checks */
	if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
		if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
			return false;
		if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
			return false;
	} else {
	/* protected mode guest state checks */
		if (!cs_ss_rpl_check(vcpu))
			return false;
		if (!code_segment_valid(vcpu))
			return false;
		if (!stack_segment_valid(vcpu))
			return false;
		if (!data_segment_valid(vcpu, VCPU_SREG_DS))
			return false;
		if (!data_segment_valid(vcpu, VCPU_SREG_ES))
			return false;
		if (!data_segment_valid(vcpu, VCPU_SREG_FS))
			return false;
		if (!data_segment_valid(vcpu, VCPU_SREG_GS))
			return false;
		if (!tr_valid(vcpu))
			return false;
		if (!ldtr_valid(vcpu))
			return false;
	}
	/* TODO:
	 * - Add checks on RIP
	 * - Add checks on RFLAGS
	 */

	return true;
}

/*
 * Initialise the three guest pages starting at the VM's tss_addr: every page
 * is cleared, the 16-bit value TSS_BASE_SIZE + TSS_REDIRECTION_SIZE is
 * written at TSS_IOPB_BASE_OFFSET in the first page, and one all-ones byte
 * is written at offset RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1 in the third page.
 *
 * Runs under kvm->srcu. Returns the result of the last guest-memory helper
 * that was called; the sequence stops at the first negative result.
 */
static int init_rmode_tss(struct kvm *kvm)
{
	gfn_t fn;
	u16 data = 0;
	int idx, r;

	idx = srcu_read_lock(&kvm->srcu);
	fn = to_kvm_vmx(kvm)->tss_addr >> PAGE_SHIFT;
	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
	if (r < 0)
		goto out;
	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
	/* fn++ : write to the first page, then advance to the second. */
	r = kvm_write_guest_page(kvm, fn++, &data,
			TSS_IOPB_BASE_OFFSET, sizeof(u16));
	if (r < 0)
		goto out;
	/* fn++ : clear the second page, then advance to the third. */
	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
	if (r < 0)
		goto out;
	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
	if (r < 0)
		goto out;
	data = ~0;
	/* Only sizeof(u8) bytes of 'data' are written here. */
	r = kvm_write_guest_page(kvm, fn, &data,
				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
				 sizeof(u8));
out:
	srcu_read_unlock(&kvm->srcu, idx);
	return r;
}

/*
 * Create and fill the identity-map page for this VM, once.
 *
 * Under kvm->slots_lock: if the page table has not been built yet, pick
 * VMX_EPT_IDENTITY_PAGETABLE_ADDR when no address was configured, register
 * the private memslot, clear the page and write PT32_ENT_PER_PAGE 32-bit
 * entries, entry i being (i << 22) plus the present/RW/user/accessed/dirty/
 * PSE flags. Returns 0 when already done, otherwise the result of the last
 * helper called (negative on failure).
 */
static int init_rmode_identity_map(struct kvm *kvm)
{
	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
	int i, idx, r = 0;
	kvm_pfn_t identity_map_pfn;
	u32 tmp;

	/* Protect kvm_vmx->ept_identity_pagetable_done. */
	mutex_lock(&kvm->slots_lock);

	if (likely(kvm_vmx->ept_identity_pagetable_done))
		goto out2;

	if (!kvm_vmx->ept_identity_map_addr)
		kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
	identity_map_pfn = kvm_vmx->ept_identity_map_addr >> PAGE_SHIFT;

	r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
				    kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
	if (r < 0)
		goto out2;

	idx = srcu_read_lock(&kvm->srcu);
	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
	if (r < 0)
		goto out;
	/* Set up identity-mapping pagetable for EPT in real mode */
	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
		r = kvm_write_guest_page(kvm, identity_map_pfn,
				&tmp, i * sizeof(tmp), sizeof(tmp));
		if (r < 0)
			goto out;
	}
	kvm_vmx->ept_identity_pagetable_done = true;

out:
	srcu_read_unlock(&kvm->srcu, idx);

out2:
	mutex_unlock(&kvm->slots_lock);
	return r;
}

/*
 * Write default VMCS values for segment register 'seg': selector 0, base 0,
 * limit 0xffff and access rights 0x93 (0x9b for CS).
 */
static void seg_setup(int seg)
{
	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
	unsigned int ar;

	vmcs_write16(sf->selector, 0);
	vmcs_writel(sf->base, 0);
	vmcs_write32(sf->limit, 0xffff);
	ar = 0x93;
	if (seg == VCPU_SREG_CS)
		ar |= 0x08; /* code segment */

	vmcs_write32(sf->ar_bytes, ar);
}

/*
 * Register the private memslot at APIC_DEFAULT_PHYS_BASE, once per VM.
 *
 * Under kvm->slots_lock. Returns 0 on success or if already done, the error
 * from __x86_set_memory_region(), or -EFAULT when the page lookup fails.
 */
static int alloc_apic_access_page(struct kvm *kvm)
{
	struct page *page;
	int r = 0;

	mutex_lock(&kvm->slots_lock);
	if (kvm->arch.apic_access_page_done)
		goto out;
	r = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
				    APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
	if (r)
		goto out;

	page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
	if (is_error_page(page)) {
		r = -EFAULT;
		goto out;
	}

	/*
	 * Do not pin the page in memory, so that memory hot-unplug
	 * is able to migrate it.
	 */
	put_page(page);
	kvm->arch.apic_access_page_done = true;
out:
	mutex_unlock(&kvm->slots_lock);
	return r;
}

/*
 * Reserve the lowest clear bit in vmx_vpid_bitmap and return its index.
 * Returns 0 when VPID is disabled or no bit below VMX_NR_VPIDS is free.
 */
int allocate_vpid(void)
{
	int vpid;

	if (!enable_vpid)
		return 0;
	spin_lock(&vmx_vpid_lock);
	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
	if (vpid < VMX_NR_VPIDS)
		__set_bit(vpid, vmx_vpid_bitmap);
	else
		vpid = 0;
	spin_unlock(&vmx_vpid_lock);
	return vpid;
}

/*
 * Release a VPID obtained from allocate_vpid(). A value of 0, or VPID being
 * disabled, makes this a no-op.
 */
void free_vpid(int vpid)
{
	if (!enable_vpid || vpid == 0)
		return;
	spin_lock(&vmx_vpid_lock);
	__clear_bit(vpid, vmx_vpid_bitmap);
	spin_unlock(&vmx_vpid_lock);
}

/*
 * Clear the read and/or write bits (selected by 'type': MSR_TYPE_R,
 * MSR_TYPE_W) for 'msr' in 'msr_bitmap'. MSRs outside the two ranges named
 * in the comment below are left untouched. Does nothing when the CPU has no
 * MSR-bitmap support.
 */
static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
							  u32 msr, int type)
{
	/* Byte offsets below are divided by f to index unsigned long words. */
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return;

	if (static_branch_unlikely(&enable_evmcs))
		evmcs_touch_msr_bitmap();

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R)
			/* read-low */
			__clear_bit(msr, msr_bitmap + 0x000 / f);

		if (type & MSR_TYPE_W)
			/* write-low */
			__clear_bit(msr, msr_bitmap + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R)
			/* read-high */
			__clear_bit(msr, msr_bitmap + 0x400 / f);

		if (type & MSR_TYPE_W)
			/* write-high */
			__clear_bit(msr, msr_bitmap + 0xc00 / f);

	}
}

/*
 * Counterpart of vmx_disable_intercept_for_msr(): set the read and/or write
 * bits (selected by 'type') for 'msr' in 'msr_bitmap', using the same ranges
 * and offsets.
 */
static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
							 u32 msr, int type)
{
	/* Byte offsets below are divided by f to index unsigned long words. */
	int f = sizeof(unsigned long);

	if (!cpu_has_vmx_msr_bitmap())
		return;

	if (static_branch_unlikely(&enable_evmcs))
		evmcs_touch_msr_bitmap();

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R)
			/* read-low */
			__set_bit(msr, msr_bitmap + 0x000 / f);

		if (type & MSR_TYPE_W)
			/* write-low */
			__set_bit(msr, msr_bitmap + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R)
			/* read-high */
			__set_bit(msr, msr_bitmap + 0x400 / f);

		if (type & MSR_TYPE_W)
			/* write-high */
			__set_bit(msr, msr_bitmap + 0xc00 / f);

	}
}

/* Enable the intercept when 'value' is true, disable it otherwise. */
static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
						      u32 msr, int type, bool value)
{
	if (value)
		vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
	else
		vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
}

/*
 * Compute the MSR_BITMAP_MODE_* flags for this vCPU from the current
 * SECONDARY_VM_EXEC_CONTROL value: X2APIC when x2APIC virtualization is
 * set there, plus X2APIC_APICV when APICv is also enabled and active.
 */
static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
{
	u8 mode = 0;

	if (cpu_has_secondary_exec_ctrls() &&
	    (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
	     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
		mode |= MSR_BITMAP_MODE_X2APIC;
		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
			mode |= MSR_BITMAP_MODE_X2APIC_APICV;
	}

	return mode;
}

/*
 * Rewrite the bitmap words covering MSRs 0x800-0x8ff for the given mode,
 * then apply the per-register exceptions below.
 */
static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
					 u8 mode)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;
		/* Read bits: all clear with APICv, all set otherwise. */
		msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
		/* Write bits (byte offset 0x800): always all set. */
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}

	if (mode & MSR_BITMAP_MODE_X2APIC) {
		/*
		 * TPR reads and writes can be virtualized even if virtual interrupt
		 * delivery is not in use.
		 */
		vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
		if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
			vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
		}
	}
}

/*
 * Recompute the bitmap mode for this vCPU and, if it differs from the cached
 * vmx->msr_bitmap_mode, update the vmcs01 MSR bitmap and the cached mode.
 */
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
	u8 mode = vmx_msr_bitmap_mode(vcpu);
	u8 changed = mode ^ vmx->msr_bitmap_mode;

	if (!changed)
		return;

	if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
		vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);

	vmx->msr_bitmap_mode = mode;
}

/*
 * Set or clear read/write intercepts for the RTIT MSRs listed below in the
 * vmcs01 bitmap. Intercepts are enabled when RTIT_CTL_TRACEEN is clear in
 * the guest's ctl value and disabled when it is set.
 */
void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
{
	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
	bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
	u32 i;

	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
							MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
							MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
							MSR_TYPE_RW, flag);
	vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
							MSR_TYPE_RW, flag);
	/* The A/B MSRs of range i are addressed at stride 2 from range 0. */
	for (i = 0; i < vmx->pt_desc.addr_range; i++) {
		vmx_set_intercept_for_msr(msr_bitmap,
			MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
		vmx_set_intercept_for_msr(msr_bitmap,
			MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
	}
}

/* Return the enable_apicv module setting; vcpu is not consulted. */
static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
{
	return enable_apicv;
}

/*
 * Return true when bits 7:4 of the value from vmx_get_rvi() are greater
 * than bits 7:4 of the APIC_PROCPRI word in the nested virtual-APIC page.
 * Returns false (with a one-time warning where marked) if the vCPU is not
 * in guest mode, vmcs12 does not use virtual interrupt delivery, or there
 * is no nested virtual-APIC page.
 */
static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	void *vapic_page;
	u32 vppr;
	int rvi;

	if (WARN_ON_ONCE(!is_guest_mode(vcpu)) ||
		!nested_cpu_has_vid(get_vmcs12(vcpu)) ||
		WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
		return false;

	rvi = vmx_get_rvi();

	vapic_page = kmap(vmx->nested.virtual_apic_page);
	vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
	kunmap(vmx->nested.virtual_apic_page);

	return ((rvi & 0xf0) > (vppr & 0xf0));
}

/*
 * If the vCPU is in IN_GUEST_MODE, send the posted-interrupt notification
 * IPI (nested or regular vector) to the CPU recorded in vcpu->cpu and return
 * true. Returns false otherwise, and always on !CONFIG_SMP builds.
 */
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
						     bool nested)
{
#ifdef CONFIG_SMP
	int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;

	if (vcpu->mode == IN_GUEST_MODE) {
		/*
		 * The vector of interrupt to be delivered to vcpu had
		 * been set in PIR before this function.
		 *
		 * Following cases will be reached in this block, and
		 * we always send a notification event in all cases as
		 * explained below.
		 *
		 * Case 1: vcpu keeps in non-root mode. Sending a
		 * notification event posts the interrupt to vcpu.
		 *
		 * Case 2: vcpu exits to root mode and is still
		 * runnable. PIR will be synced to vIRR before the
		 * next vcpu entry. Sending a notification event in
		 * this case has no effect, as vcpu is no longer in
		 * non-root mode.
		 *
		 * Case 3: vcpu exits to root mode and is blocked.
		 * vcpu_block() has already synced PIR to vIRR and
		 * never blocks vcpu if vIRR is not cleared. Therefore,
		 * a blocked vcpu here does not wait for any requested
		 * interrupts in PIR, and sending a notification event
		 * which has no effect is safe here.
		 */

		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
		return true;
	}
#endif
	return false;
}

/*
 * Handle 'vector' as a nested posted-interrupt notification.
 *
 * Returns 0 when the vCPU is in guest mode and 'vector' equals the nested
 * notification vector (the notification is then marked pending and the vCPU
 * is notified or kicked); returns -1 otherwise, leaving delivery to the
 * caller.
 */
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
						int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (is_guest_mode(vcpu) &&
	    vector == vmx->nested.posted_intr_nv) {
		/*
		 * If a posted intr is not recognized by hardware,
		 * we will accomplish it in the next vmentry.
		 */
		vmx->nested.pi_pending = true;
		kvm_make_request(KVM_REQ_EVENT, vcpu);
		/* the PIR and ON have been set by L1. */
		if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
			kvm_vcpu_kick(vcpu);
		return 0;
	}
	return -1;
}
/*
 * Send interrupt to vcpu via posted interrupt way.
 * 1. If target vcpu is running(non-root mode), send posted interrupt
 * notification to vcpu and hardware will sync PIR to vIRR atomically.
 * 2. If target vcpu isn't running(root mode), kick it to pick up the
 * interrupt from PIR in next vmentry.
 */
static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int r;

	/* A return of 0 means the nested path consumed the vector. */
	r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
	if (!r)
		return;

	/* Nothing more to do if the vector was already set in the PIR. */
	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
		return;

	/* If a previous notification has sent the IPI, nothing to do. */
	if (pi_test_and_set_on(&vmx->pi_desc))
		return;

	if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
		kvm_vcpu_kick(vcpu);
}

/*
 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
 * will not change in the lifetime of the guest.
 * Note that host-state that does change is set elsewhere. E.g., host-state
 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
 */
void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
{
	u32 low32, high32;
	unsigned long tmpl;
	struct desc_ptr dt;
	unsigned long cr0, cr3, cr4;

	cr0 = read_cr0();
	WARN_ON(cr0 & X86_CR0_TS);
	vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */

	/*
	 * Save the most likely value for this task's CR3 in the VMCS.
	 * We can't use __get_current_cr3_fast() because we're not atomic.
	 */
	cr3 = __read_cr3();
	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
	vmx->loaded_vmcs->host_state.cr3 = cr3;

	/* Save the most likely value for this task's CR4 in the VMCS. */
	cr4 = cr4_read_shadow();
	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
	vmx->loaded_vmcs->host_state.cr4 = cr4;

	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
#ifdef CONFIG_X86_64
	/*
	 * Load null selectors, so we can avoid reloading them in
	 * vmx_prepare_switch_to_host(), in case userspace uses
	 * the null selectors too (the expected case).
	 */
	vmcs_write16(HOST_DS_SELECTOR, 0);
	vmcs_write16(HOST_ES_SELECTOR, 0);
#else
	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
#endif
	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */

	/* The IDT base is also cached in vmx->host_idt_base. */
	store_idt(&dt);
	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
	vmx->host_idt_base = dt.address;

	vmcs_writel(HOST_RIP, (unsigned long)vmx_vmexit); /* 22.2.5 */

	/* Only the low 32 bits of SYSENTER_CS are stored. */
	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
	vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
	rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
	vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */

	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
		rdmsr(MSR_IA32_CR_PAT, low32, high32);
		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
	}

	if (cpu_has_load_ia32_efer())
		vmcs_write64(HOST_IA32_EFER, host_efer);
}

/*
 * Recompute vcpu.arch.cr4_guest_owned_bits and write its complement to
 * CR4_GUEST_HOST_MASK. The owned set starts from KVM_CR4_GUEST_OWNED_BITS,
 * gains X86_CR4_PGE when EPT is enabled, and in guest mode loses every bit
 * set in vmcs12's cr4_guest_host_mask.
 */
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
{
	vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
	if (enable_ept)
		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
	if (is_guest_mode(&vmx->vcpu))
		vmx->vcpu.arch.cr4_guest_owned_bits &=
			~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
}

/*
 * Return the pin-based execution controls for this vCPU: the configured
 * value with posted interrupts removed when APICv is inactive, virtual NMIs
 * removed when enable_vnmi is off, and the preemption timer always removed.
 */
static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
{
	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;

	if (!kvm_vcpu_apicv_active(&vmx->vcpu))
		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;

	if (!enable_vnmi)
		pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;

	/* Enable the preemption timer dynamically */
	pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
	return pin_based_exec_ctrl;
}

/*
 * Re-apply the APICv-dependent controls: rewrite the pin-based controls,
 * set or clear APIC-register virtualization and virtual interrupt delivery
 * in the secondary controls according to whether APICv is active, and
 * refresh the MSR bitmap when the CPU supports one.
 */
static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
	if (cpu_has_secondary_exec_ctrls()) {
		if (kvm_vcpu_apicv_active(vcpu))
			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
				      SECONDARY_EXEC_APIC_REGISTER_VIRT |
				      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
		else
			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
					SECONDARY_EXEC_APIC_REGISTER_VIRT |
					SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
	}

	if (cpu_has_vmx_msr_bitmap())
		vmx_update_msr_bitmap(vcpu);
}

/*
 * Return the primary processor-based execution controls for this vCPU,
 * derived from vmcs_config.cpu_based_exec_ctrl and adjusted for debug
 * register switching, TPR shadow, EPT, and the per-VM mwait/hlt settings.
 */
u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;

	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
		exec_control &= ~CPU_BASED_MOV_DR_EXITING;

	if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
		exec_control &= ~CPU_BASED_TPR_SHADOW;
#ifdef CONFIG_X86_64
		/* Without the TPR shadow, CR8 accesses must exit. */
		exec_control |= CPU_BASED_CR8_STORE_EXITING |
				CPU_BASED_CR8_LOAD_EXITING;
#endif
	}
	if (!enable_ept)
		exec_control |= CPU_BASED_CR3_STORE_EXITING |
				CPU_BASED_CR3_LOAD_EXITING  |
				CPU_BASED_INVLPG_EXITING;
	if (kvm_mwait_in_guest(vmx->vcpu.kvm))
		exec_control &= ~(CPU_BASED_MWAIT_EXITING |
				CPU_BASED_MONITOR_EXITING);
	if (kvm_hlt_in_guest(vmx->vcpu.kvm))
		exec_control &= ~CPU_BASED_HLT_EXITING;
	return exec_control;
}


/*
 * Compute the secondary processor-based execution controls for this vCPU
 * and store them in vmx->secondary_exec_control (the VMCS is not written
 * here).
 *
 * Side effects visible in this function: clears the global
 * enable_unrestricted_guest when EPT is disabled, may clear the guest's
 * INVPCID CPUID bit, and when 'nested' is set updates
 * vmx->nested.msrs.secondary_ctls_high for XSAVES, RDTSCP, INVPCID, RDRAND
 * and RDSEED to follow the guest's CPUID.
 */
static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
{
	struct kvm_vcpu *vcpu = &vmx->vcpu;

	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;

	if (pt_mode == PT_MODE_SYSTEM)
		exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
	if (!cpu_need_virtualize_apic_accesses(vcpu))
		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
	if (vmx->vpid == 0)
		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
	if (!enable_ept) {
		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
		/* NOTE: this writes the module-wide setting, not a per-vCPU one. */
		enable_unrestricted_guest = 0;
	}
	if (!enable_unrestricted_guest)
		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
	if (kvm_pause_in_guest(vmx->vcpu.kvm))
		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
	if (!kvm_vcpu_apicv_active(vcpu))
		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;

	/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
	 * in vmx_set_cr4.  */
	exec_control &= ~SECONDARY_EXEC_DESC;

	/* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
	   (handle_vmptrld).
	   We can NOT enable shadow_vmcs here because we don't have yet
	   a current VMCS12
	*/
	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

	if (!enable_pml)
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

	if (vmx_xsaves_supported()) {
		/* Exposing XSAVES only when XSAVE is exposed */
		bool xsaves_enabled =
			guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
			guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);

		if (!xsaves_enabled)
			exec_control &= ~SECONDARY_EXEC_XSAVES;

		if (nested) {
			if (xsaves_enabled)
				vmx->nested.msrs.secondary_ctls_high |=
					SECONDARY_EXEC_XSAVES;
			else
				vmx->nested.msrs.secondary_ctls_high &=
					~SECONDARY_EXEC_XSAVES;
		}
	}

	if (vmx_rdtscp_supported()) {
		bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
		if (!rdtscp_enabled)
			exec_control &= ~SECONDARY_EXEC_RDTSCP;

		if (nested) {
			if (rdtscp_enabled)
				vmx->nested.msrs.secondary_ctls_high |=
					SECONDARY_EXEC_RDTSCP;
			else
				vmx->nested.msrs.secondary_ctls_high &=
					~SECONDARY_EXEC_RDTSCP;
		}
	}

	if (vmx_invpcid_supported()) {
		/* Exposing INVPCID only when PCID is exposed */
		bool invpcid_enabled =
			guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
			guest_cpuid_has(vcpu, X86_FEATURE_PCID);

		if (!invpcid_enabled) {
			exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
			guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
		}

		if (nested) {
			if (invpcid_enabled)
				vmx->nested.msrs.secondary_ctls_high |=
					SECONDARY_EXEC_ENABLE_INVPCID;
			else
				vmx->nested.msrs.secondary_ctls_high &=
					~SECONDARY_EXEC_ENABLE_INVPCID;
		}
	}

	if (vmx_rdrand_supported()) {
		bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
		/* These are *_EXITING controls: cleared when the feature is exposed. */
		if (rdrand_enabled)
			exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;

		if (nested) {
			if (rdrand_enabled)
				vmx->nested.msrs.secondary_ctls_high |=
					SECONDARY_EXEC_RDRAND_EXITING;
			else
				vmx->nested.msrs.secondary_ctls_high &=
					~SECONDARY_EXEC_RDRAND_EXITING;
		}
	}

	if (vmx_rdseed_supported()) {
		bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
		if (rdseed_enabled)
			exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;

		if (nested) {
			if (rdseed_enabled)
				vmx->nested.msrs.secondary_ctls_high |=
					SECONDARY_EXEC_RDSEED_EXITING;
			else
				vmx->nested.msrs.secondary_ctls_high &=
					~SECONDARY_EXEC_RDSEED_EXITING;
		}
	}

	vmx->secondary_exec_control = exec_control;
}

/*
 * Register the MMIO SPTE mask/value pair with the MMU: mask VMX_EPT_RWX_MASK,
 * value VMX_EPT_MISCONFIG_WX_VALUE.
 */
static void ept_set_mmio_spte_mask(void)
{
	/*
	 * EPT Misconfigurations can be generated if the value of bits 2:0
	 * of an EPT paging-structure entry is 110b (write/execute).
	 */
	kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
				   VMX_EPT_MISCONFIG_WX_VALUE);
}

/* Value written to the XSS_EXIT_BITMAP VMCS field during vCPU setup. */
#define VMX_XSS_EXIT_BITMAP 0

/*
 * Sets up the vmcs for emulated real mode.
 */
static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
	int i;

	if (nested)
		nested_vmx_vcpu_setup();

	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));

	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */

	/* Control */
	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
	vmx->hv_deadline_tsc = -1;

	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));

	if (cpu_has_secondary_exec_ctrls()) {
		vmx_compute_secondary_exec_control(vmx);
		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
			     vmx->secondary_exec_control);
	}

	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
		vmcs_write64(EOI_EXIT_BITMAP0, 0);
		vmcs_write64(EOI_EXIT_BITMAP1, 0);
		vmcs_write64(EOI_EXIT_BITMAP2, 0);
		vmcs_write64(EOI_EXIT_BITMAP3, 0);

		vmcs_write16(GUEST_INTR_STATUS, 0);

		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
	}

	if (!kvm_pause_in_guest(vmx->vcpu.kvm)) {
		vmcs_write32(PLE_GAP, ple_gap);
		vmx->ple_window = ple_window;
		vmx->ple_window_dirty = true;
	}

	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */

	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
	vmx_set_constant_host_state(vmx);
	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */

	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

	/* MSR load/store lists start empty; only their addresses are set. */
	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);

	/*
	 * Build guest_msrs[] from the entries of vmx_msr_index[] that can be
	 * both read and written back on this host; others are skipped.
	 */
	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
		u32 index = vmx_msr_index[i];
		u32 data_low, data_high;
		int j = vmx->nmsrs;

		if (rdmsr_safe(index, &data_low, &data_high) < 0)
			continue;
		if (wrmsr_safe(index, data_low, data_high) < 0)
			continue;
		/* The stored index is the position in vmx_msr_index[], not the MSR number. */
		vmx->guest_msrs[j].index = i;
		vmx->guest_msrs[j].data = 0;
		vmx->guest_msrs[j].mask = -1ull;
		++vmx->nmsrs;
	}

	vm_exit_controls_init(vmx, vmx_vmexit_ctrl());

	/* 22.2.1, 20.8.1 */
	vm_entry_controls_init(vmx, vmx_vmentry_ctrl());

	vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);

	set_cr4_guest_host_mask(vmx);

	if (vmx_xsaves_supported())
		vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);

	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
	}

	if (cpu_has_vmx_encls_vmexit())
		vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);

	if (pt_mode == PT_MODE_HOST_GUEST) {
		memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
		/* Bit[6~0] are forced to 1, writes are ignored. */
		vmx->pt_desc.guest.output_mask = 0x7F;
		vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
	}
}

/*
 * Put the vCPU's VMX state into its reset configuration.
 *
 * init_event distinguishes the two callers' needs: when false, the APIC base
 * MSR, the SYSENTER and DEBUGCTL fields and the virtual-APIC page address /
 * TPR threshold are also reinitialised; when true, those are left alone and
 * vmx_clear_hlt() is called at the end.
 */
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct msr_data apic_base_msr;
	u64 cr0;

	vmx->rmode.vm86_active = 0;
	vmx->spec_ctrl = 0;

	vcpu->arch.microcode_version = 0x100000000ULL;
	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
	kvm_set_cr8(vcpu, 0);

	if (!init_event) {
		apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
				     MSR_IA32_APICBASE_ENABLE;
		if (kvm_vcpu_is_reset_bsp(vcpu))
			apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
		apic_base_msr.host_initiated = true;
		kvm_set_apic_base(vcpu, &apic_base_msr);
	}

	vmx_segment_cache_clear(vmx);

	/* CS gets the defaults from seg_setup(), then selector/base overrides. */
	seg_setup(VCPU_SREG_CS);
	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
	vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);

	seg_setup(VCPU_SREG_DS);
	seg_setup(VCPU_SREG_ES);
	seg_setup(VCPU_SREG_FS);
	seg_setup(VCPU_SREG_GS);
	seg_setup(VCPU_SREG_SS);

	vmcs_write16(GUEST_TR_SELECTOR, 0);
	vmcs_writel(GUEST_TR_BASE, 0);
	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);

	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
	vmcs_writel(GUEST_LDTR_BASE, 0);
	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

	if (!init_event) {
		vmcs_write32(GUEST_SYSENTER_CS, 0);
		vmcs_writel(GUEST_SYSENTER_ESP, 0);
		vmcs_writel(GUEST_SYSENTER_EIP, 0);
		vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
	}

	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
	kvm_rip_write(vcpu, 0xfff0);

	vmcs_writel(GUEST_GDTR_BASE, 0);
	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);

	vmcs_writel(GUEST_IDTR_BASE, 0);
	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
	if (kvm_mpx_supported())
		vmcs_write64(GUEST_BNDCFGS, 0);

	setup_msrs(vmx);

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */

	if (cpu_has_vmx_tpr_shadow() && !init_event) {
		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
		if (cpu_need_tpr_shadow(vcpu))
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
				     __pa(vcpu->arch.apic->regs));
		vmcs_write32(TPR_THRESHOLD, 0);
	}

	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

	if (vmx->vpid != 0)
		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);

	cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
	vmx->vcpu.arch.cr0 = cr0;
	vmx_set_cr0(vcpu, cr0); /* enter rmode */
	vmx_set_cr4(vcpu, 0);
	vmx_set_efer(vcpu, 0);

	update_exception_bitmap(vcpu);

	vpid_sync_context(vmx->vpid);
	if (init_event)
		vmx_clear_hlt(vcpu);
}

/* Set CPU_BASED_VIRTUAL_INTR_PENDING in the primary execution controls. */
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
	vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
		      CPU_BASED_VIRTUAL_INTR_PENDING);
}

/*
 * Set CPU_BASED_VIRTUAL_NMI_PENDING, unless virtual NMIs are disabled or the
 * guest is currently in STI blocking, in which case an IRQ window is
 * requested instead.
 */
static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
	if (!enable_vnmi ||
	    vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
		enable_irq_window(vcpu);
		return;
	}

	vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
		      CPU_BASED_VIRTUAL_NMI_PENDING);
}

/*
 * Inject the interrupt described by vcpu->arch.interrupt.
 *
 * With vm86 emulation active the interrupt is injected via
 * kvm_inject_realmode_interrupt(), and a triple fault is requested if that
 * fails. Otherwise the VM-entry interruption-information field is
 * programmed (soft interrupts also set the instruction length) and
 * vmx_clear_hlt() is called.
 */
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	uint32_t intr;
	int irq = vcpu->arch.interrupt.nr;

	trace_kvm_inj_virq(irq);

	++vcpu->stat.irq_injections;
	if (vmx->rmode.vm86_active) {
		int inc_eip = 0;
		if (vcpu->arch.interrupt.soft)
			inc_eip = vcpu->arch.event_exit_inst_len;
		if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}
	intr = irq | INTR_INFO_VALID_MASK;
	if (vcpu->arch.interrupt.soft) {
		intr |= INTR_TYPE_SOFT_INTR;
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmx->vcpu.arch.event_exit_inst_len);
	} else
		intr |= INTR_TYPE_EXT_INTR;
	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);

	vmx_clear_hlt(vcpu);
}

/*
 * Inject an NMI. Without virtual NMIs the blocked state is tracked in
 * software (see the comment below). With vm86 emulation active the NMI is
 * injected via kvm_inject_realmode_interrupt(); otherwise the VM-entry
 * interruption-information field is programmed and vmx_clear_hlt() is
 * called.
 */
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!enable_vnmi) {
		/*
		 * Tracking the NMI-blocked state in software is built upon
		 * finding the next open IRQ window. This, in turn, depends on
		 * well-behaving guests: They have to keep IRQs disabled at
		 * least as long as the NMI handler runs. Otherwise we may
		 * cause NMI nesting, maybe breaking the guest. But as this is
		 * highly unlikely, we can live with the residual risk.
		 */
		vmx->loaded_vmcs->soft_vnmi_blocked = 1;
		vmx->loaded_vmcs->vnmi_blocked_time = 0;
	}

	++vcpu->stat.nmi_injections;
	vmx->loaded_vmcs->nmi_known_unmasked = false;

	if (vmx->rmode.vm86_active) {
		if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}

	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);

	vmx_clear_hlt(vcpu);
}

/*
 * Return whether NMIs are currently masked for the guest. Without virtual
 * NMIs this is the software flag; otherwise it is read from the
 * interruptibility state, with nmi_known_unmasked caching a "not masked"
 * result so the VMCS read can be skipped next time.
 */
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool masked;

	if (!enable_vnmi)
		return vmx->loaded_vmcs->soft_vnmi_blocked;
	if (vmx->loaded_vmcs->nmi_known_unmasked)
		return false;
	masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
	return masked;
}

/*
 * Set the NMI-masked state. Without virtual NMIs only the software flag is
 * updated (and the blocked-time counter reset on a change); otherwise
 * GUEST_INTR_STATE_NMI is set or cleared in the interruptibility state and
 * the nmi_known_unmasked cache is kept in sync.
 */
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!enable_vnmi) {
		if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
			vmx->loaded_vmcs->soft_vnmi_blocked = masked;
			vmx->loaded_vmcs->vnmi_blocked_time = 0;
		}
	} else {
		vmx->loaded_vmcs->nmi_known_unmasked = !masked;
		if (masked)
			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
				      GUEST_INTR_STATE_NMI);
		else
			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
					GUEST_INTR_STATE_NMI);
	}
}

/*
 * Return non-zero when an NMI may be injected now: no nested run is pending,
 * NMIs are not soft-blocked (when virtual NMIs are disabled), and none of
 * the MOV-SS, STI or NMI blocking bits is set in the interruptibility state.
 */
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
{
	if (to_vmx(vcpu)->nested.nested_run_pending)
		return 0;

	if (!enable_vnmi &&
	    to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
		return 0;

	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
		  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
		   | GUEST_INTR_STATE_NMI));
}

/*
 * Return non-zero when an external interrupt may be injected now: no nested
 * run is pending, RFLAGS.IF is set in the guest, and neither STI nor MOV-SS
 * blocking is in effect.
 */
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
	return (!to_vmx(vcpu)->nested.nested_run_pending &&
		vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}

/*
 * Record 'addr' as the VM's TSS address and initialise the three-page
 * region there. A no-op returning 0 when unrestricted guest is enabled.
 * Returns the error from x86_set_memory_region() or init_rmode_tss().
 */
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	int ret;

	if (enable_unrestricted_guest)
		return 0;

	ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
				    PAGE_SIZE * 3);
	if (ret)
		return ret;
	to_kvm_vmx(kvm)->tss_addr = addr;
	return init_rmode_tss(kvm);
}

/* Record the identity-map address for this VM; always returns 0. */
static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
	to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
	return 0;
}

/*
 * Return true when exception vector 'vec' is one of the vectors listed
 * below and is not claimed by guest debugging: #BP returns false when
 * software breakpoints are in use, and #BP/#DB return false when
 * single-stepping or hardware breakpoints are in use. Any vector not listed
 * returns false.
 */
static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
{
	switch (vec) {
	case BP_VECTOR:
		/*
		 * Update instruction length as we may reinject the exception
		 * from user space while in guest debugging mode.
		 */
		to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			return false;
		/* fall through */
	case DB_VECTOR:
		if (vcpu->guest_debug &
			(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
			return false;
		/* fall through */
	case DE_VECTOR:
	case OF_VECTOR:
	case BR_VECTOR:
	case UD_VECTOR:
	case DF_VECTOR:
	case SS_VECTOR:
	case GP_VECTOR:
	case MF_VECTOR:
		return true;
	break;
	}
	return false;
}

/*
 * Handle exception 'vec' with error code 'err_code' for a real-mode guest.
 *
 * #GP and #SS with a zero error code are handled by emulating the
 * instruction: returns 0 if emulation does not complete, the result of
 * kvm_vcpu_halt() if the emulated instruction requested a halt, and 1
 * otherwise. Every other exception is queued for the guest and 1 is
 * returned.
 */
static int handle_rmode_exception(struct kvm_vcpu *vcpu,
				  int vec, u32 err_code)
{
	/*
	 * Instruction with address size override prefix opcode 0x67
	 * Cause the #SS fault with 0 error code in VM86 mode.
	 */
	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
		if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
			if (vcpu->arch.halt_request) {
				vcpu->arch.halt_request = 0;
				return kvm_vcpu_halt(vcpu);
			}
			return 1;
		}
		return 0;
	}

	/*
	 * Forward all other exceptions that are valid in real mode.
	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
	 *        the required debugging infrastructure rework.
	 */
	kvm_queue_exception(vcpu, vec);
	return 1;
}

/*
 * Trigger machine check on the host. We assume all the MSRs are already set up
 * by the CPU and that we still run on the same CPU as the MCE occurred on.
 * We pass a fake environment to the machine check handler because we want
 * the guest to be always treated like user space, no matter what context
 * it used internally.
4426 */ 4427 static void kvm_machine_check(void) 4428 { 4429 #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) 4430 struct pt_regs regs = { 4431 .cs = 3, /* Fake ring 3 no matter what the guest ran on */ 4432 .flags = X86_EFLAGS_IF, 4433 }; 4434 4435 do_machine_check(®s, 0); 4436 #endif 4437 } 4438 4439 static int handle_machine_check(struct kvm_vcpu *vcpu) 4440 { 4441 /* already handled by vcpu_run */ 4442 return 1; 4443 } 4444 4445 static int handle_exception(struct kvm_vcpu *vcpu) 4446 { 4447 struct vcpu_vmx *vmx = to_vmx(vcpu); 4448 struct kvm_run *kvm_run = vcpu->run; 4449 u32 intr_info, ex_no, error_code; 4450 unsigned long cr2, rip, dr6; 4451 u32 vect_info; 4452 enum emulation_result er; 4453 4454 vect_info = vmx->idt_vectoring_info; 4455 intr_info = vmx->exit_intr_info; 4456 4457 if (is_machine_check(intr_info)) 4458 return handle_machine_check(vcpu); 4459 4460 if (is_nmi(intr_info)) 4461 return 1; /* already handled by vmx_vcpu_run() */ 4462 4463 if (is_invalid_opcode(intr_info)) 4464 return handle_ud(vcpu); 4465 4466 error_code = 0; 4467 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 4468 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 4469 4470 if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { 4471 WARN_ON_ONCE(!enable_vmware_backdoor); 4472 er = kvm_emulate_instruction(vcpu, 4473 EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); 4474 if (er == EMULATE_USER_EXIT) 4475 return 0; 4476 else if (er != EMULATE_DONE) 4477 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 4478 return 1; 4479 } 4480 4481 /* 4482 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 4483 * MMIO, it is better to report an internal error. 4484 * See the comments in vmx_handle_exit. 
4485 */ 4486 if ((vect_info & VECTORING_INFO_VALID_MASK) && 4487 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 4488 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4489 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 4490 vcpu->run->internal.ndata = 3; 4491 vcpu->run->internal.data[0] = vect_info; 4492 vcpu->run->internal.data[1] = intr_info; 4493 vcpu->run->internal.data[2] = error_code; 4494 return 0; 4495 } 4496 4497 if (is_page_fault(intr_info)) { 4498 cr2 = vmcs_readl(EXIT_QUALIFICATION); 4499 /* EPT won't cause page fault directly */ 4500 WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); 4501 return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); 4502 } 4503 4504 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 4505 4506 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 4507 return handle_rmode_exception(vcpu, ex_no, error_code); 4508 4509 switch (ex_no) { 4510 case AC_VECTOR: 4511 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 4512 return 1; 4513 case DB_VECTOR: 4514 dr6 = vmcs_readl(EXIT_QUALIFICATION); 4515 if (!(vcpu->guest_debug & 4516 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 4517 vcpu->arch.dr6 &= ~15; 4518 vcpu->arch.dr6 |= dr6 | DR6_RTM; 4519 if (is_icebp(intr_info)) 4520 skip_emulated_instruction(vcpu); 4521 4522 kvm_queue_exception(vcpu, DB_VECTOR); 4523 return 1; 4524 } 4525 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; 4526 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 4527 /* fall through */ 4528 case BP_VECTOR: 4529 /* 4530 * Update instruction length as we may reinject #BP from 4531 * user space while in guest debugging mode. Reading it for 4532 * #DB as well causes no harm, it is not used in that case. 
4533 */ 4534 vmx->vcpu.arch.event_exit_inst_len = 4535 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4536 kvm_run->exit_reason = KVM_EXIT_DEBUG; 4537 rip = kvm_rip_read(vcpu); 4538 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 4539 kvm_run->debug.arch.exception = ex_no; 4540 break; 4541 default: 4542 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 4543 kvm_run->ex.exception = ex_no; 4544 kvm_run->ex.error_code = error_code; 4545 break; 4546 } 4547 return 0; 4548 } 4549 4550 static int handle_external_interrupt(struct kvm_vcpu *vcpu) 4551 { 4552 ++vcpu->stat.irq_exits; 4553 return 1; 4554 } 4555 4556 static int handle_triple_fault(struct kvm_vcpu *vcpu) 4557 { 4558 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 4559 vcpu->mmio_needed = 0; 4560 return 0; 4561 } 4562 4563 static int handle_io(struct kvm_vcpu *vcpu) 4564 { 4565 unsigned long exit_qualification; 4566 int size, in, string; 4567 unsigned port; 4568 4569 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4570 string = (exit_qualification & 16) != 0; 4571 4572 ++vcpu->stat.io_exits; 4573 4574 if (string) 4575 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; 4576 4577 port = exit_qualification >> 16; 4578 size = (exit_qualification & 7) + 1; 4579 in = (exit_qualification & 8) != 0; 4580 4581 return kvm_fast_pio(vcpu, size, port, in); 4582 } 4583 4584 static void 4585 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 4586 { 4587 /* 4588 * Patch in the VMCALL instruction: 4589 */ 4590 hypercall[0] = 0x0f; 4591 hypercall[1] = 0x01; 4592 hypercall[2] = 0xc1; 4593 } 4594 4595 /* called to set cr0 as appropriate for a mov-to-cr0 exit. 
*/ 4596 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 4597 { 4598 if (is_guest_mode(vcpu)) { 4599 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4600 unsigned long orig_val = val; 4601 4602 /* 4603 * We get here when L2 changed cr0 in a way that did not change 4604 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 4605 * but did change L0 shadowed bits. So we first calculate the 4606 * effective cr0 value that L1 would like to write into the 4607 * hardware. It consists of the L2-owned bits from the new 4608 * value combined with the L1-owned bits from L1's guest_cr0. 4609 */ 4610 val = (val & ~vmcs12->cr0_guest_host_mask) | 4611 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 4612 4613 if (!nested_guest_cr0_valid(vcpu, val)) 4614 return 1; 4615 4616 if (kvm_set_cr0(vcpu, val)) 4617 return 1; 4618 vmcs_writel(CR0_READ_SHADOW, orig_val); 4619 return 0; 4620 } else { 4621 if (to_vmx(vcpu)->nested.vmxon && 4622 !nested_host_cr0_valid(vcpu, val)) 4623 return 1; 4624 4625 return kvm_set_cr0(vcpu, val); 4626 } 4627 } 4628 4629 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 4630 { 4631 if (is_guest_mode(vcpu)) { 4632 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4633 unsigned long orig_val = val; 4634 4635 /* analogously to handle_set_cr0 */ 4636 val = (val & ~vmcs12->cr4_guest_host_mask) | 4637 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 4638 if (kvm_set_cr4(vcpu, val)) 4639 return 1; 4640 vmcs_writel(CR4_READ_SHADOW, orig_val); 4641 return 0; 4642 } else 4643 return kvm_set_cr4(vcpu, val); 4644 } 4645 4646 static int handle_desc(struct kvm_vcpu *vcpu) 4647 { 4648 WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); 4649 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; 4650 } 4651 4652 static int handle_cr(struct kvm_vcpu *vcpu) 4653 { 4654 unsigned long exit_qualification, val; 4655 int cr; 4656 int reg; 4657 int err; 4658 int ret; 4659 4660 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4661 cr = 
exit_qualification & 15; 4662 reg = (exit_qualification >> 8) & 15; 4663 switch ((exit_qualification >> 4) & 3) { 4664 case 0: /* mov to cr */ 4665 val = kvm_register_readl(vcpu, reg); 4666 trace_kvm_cr_write(cr, val); 4667 switch (cr) { 4668 case 0: 4669 err = handle_set_cr0(vcpu, val); 4670 return kvm_complete_insn_gp(vcpu, err); 4671 case 3: 4672 WARN_ON_ONCE(enable_unrestricted_guest); 4673 err = kvm_set_cr3(vcpu, val); 4674 return kvm_complete_insn_gp(vcpu, err); 4675 case 4: 4676 err = handle_set_cr4(vcpu, val); 4677 return kvm_complete_insn_gp(vcpu, err); 4678 case 8: { 4679 u8 cr8_prev = kvm_get_cr8(vcpu); 4680 u8 cr8 = (u8)val; 4681 err = kvm_set_cr8(vcpu, cr8); 4682 ret = kvm_complete_insn_gp(vcpu, err); 4683 if (lapic_in_kernel(vcpu)) 4684 return ret; 4685 if (cr8_prev <= cr8) 4686 return ret; 4687 /* 4688 * TODO: we might be squashing a 4689 * KVM_GUESTDBG_SINGLESTEP-triggered 4690 * KVM_EXIT_DEBUG here. 4691 */ 4692 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 4693 return 0; 4694 } 4695 } 4696 break; 4697 case 2: /* clts */ 4698 WARN_ONCE(1, "Guest should always own CR0.TS"); 4699 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 4700 trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); 4701 return kvm_skip_emulated_instruction(vcpu); 4702 case 1: /*mov from cr*/ 4703 switch (cr) { 4704 case 3: 4705 WARN_ON_ONCE(enable_unrestricted_guest); 4706 val = kvm_read_cr3(vcpu); 4707 kvm_register_write(vcpu, reg, val); 4708 trace_kvm_cr_read(cr, val); 4709 return kvm_skip_emulated_instruction(vcpu); 4710 case 8: 4711 val = kvm_get_cr8(vcpu); 4712 kvm_register_write(vcpu, reg, val); 4713 trace_kvm_cr_read(cr, val); 4714 return kvm_skip_emulated_instruction(vcpu); 4715 } 4716 break; 4717 case 3: /* lmsw */ 4718 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 4719 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); 4720 kvm_lmsw(vcpu, val); 4721 4722 return kvm_skip_emulated_instruction(vcpu); 4723 default: 4724 break; 4725 } 4726 
vcpu->run->exit_reason = 0; 4727 vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 4728 (int)(exit_qualification >> 4) & 3, cr); 4729 return 0; 4730 } 4731 4732 static int handle_dr(struct kvm_vcpu *vcpu) 4733 { 4734 unsigned long exit_qualification; 4735 int dr, dr7, reg; 4736 4737 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4738 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 4739 4740 /* First, if DR does not exist, trigger UD */ 4741 if (!kvm_require_dr(vcpu, dr)) 4742 return 1; 4743 4744 /* Do not handle if the CPL > 0, will trigger GP on re-entry */ 4745 if (!kvm_require_cpl(vcpu, 0)) 4746 return 1; 4747 dr7 = vmcs_readl(GUEST_DR7); 4748 if (dr7 & DR7_GD) { 4749 /* 4750 * As the vm-exit takes precedence over the debug trap, we 4751 * need to emulate the latter, either for the host or the 4752 * guest debugging itself. 4753 */ 4754 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 4755 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; 4756 vcpu->run->debug.arch.dr7 = dr7; 4757 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 4758 vcpu->run->debug.arch.exception = DB_VECTOR; 4759 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 4760 return 0; 4761 } else { 4762 vcpu->arch.dr6 &= ~15; 4763 vcpu->arch.dr6 |= DR6_BD | DR6_RTM; 4764 kvm_queue_exception(vcpu, DB_VECTOR); 4765 return 1; 4766 } 4767 } 4768 4769 if (vcpu->guest_debug == 0) { 4770 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, 4771 CPU_BASED_MOV_DR_EXITING); 4772 4773 /* 4774 * No more DR vmexits; force a reload of the debug registers 4775 * and reenter on this instruction. The next vmexit will 4776 * retrieve the full state of the debug registers. 
4777 */ 4778 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 4779 return 1; 4780 } 4781 4782 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 4783 if (exit_qualification & TYPE_MOV_FROM_DR) { 4784 unsigned long val; 4785 4786 if (kvm_get_dr(vcpu, dr, &val)) 4787 return 1; 4788 kvm_register_write(vcpu, reg, val); 4789 } else 4790 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) 4791 return 1; 4792 4793 return kvm_skip_emulated_instruction(vcpu); 4794 } 4795 4796 static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) 4797 { 4798 return vcpu->arch.dr6; 4799 } 4800 4801 static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) 4802 { 4803 } 4804 4805 static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 4806 { 4807 get_debugreg(vcpu->arch.db[0], 0); 4808 get_debugreg(vcpu->arch.db[1], 1); 4809 get_debugreg(vcpu->arch.db[2], 2); 4810 get_debugreg(vcpu->arch.db[3], 3); 4811 get_debugreg(vcpu->arch.dr6, 6); 4812 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 4813 4814 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 4815 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING); 4816 } 4817 4818 static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 4819 { 4820 vmcs_writel(GUEST_DR7, val); 4821 } 4822 4823 static int handle_cpuid(struct kvm_vcpu *vcpu) 4824 { 4825 return kvm_emulate_cpuid(vcpu); 4826 } 4827 4828 static int handle_rdmsr(struct kvm_vcpu *vcpu) 4829 { 4830 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 4831 struct msr_data msr_info; 4832 4833 msr_info.index = ecx; 4834 msr_info.host_initiated = false; 4835 if (vmx_get_msr(vcpu, &msr_info)) { 4836 trace_kvm_msr_read_ex(ecx); 4837 kvm_inject_gp(vcpu, 0); 4838 return 1; 4839 } 4840 4841 trace_kvm_msr_read(ecx, msr_info.data); 4842 4843 /* FIXME: handling of bits 32:63 of rax, rdx */ 4844 vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; 4845 vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; 4846 return kvm_skip_emulated_instruction(vcpu); 4847 } 4848 4849 static 
int handle_wrmsr(struct kvm_vcpu *vcpu) 4850 { 4851 struct msr_data msr; 4852 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 4853 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 4854 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 4855 4856 msr.data = data; 4857 msr.index = ecx; 4858 msr.host_initiated = false; 4859 if (kvm_set_msr(vcpu, &msr) != 0) { 4860 trace_kvm_msr_write_ex(ecx, data); 4861 kvm_inject_gp(vcpu, 0); 4862 return 1; 4863 } 4864 4865 trace_kvm_msr_write(ecx, data); 4866 return kvm_skip_emulated_instruction(vcpu); 4867 } 4868 4869 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 4870 { 4871 kvm_apic_update_ppr(vcpu); 4872 return 1; 4873 } 4874 4875 static int handle_interrupt_window(struct kvm_vcpu *vcpu) 4876 { 4877 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, 4878 CPU_BASED_VIRTUAL_INTR_PENDING); 4879 4880 kvm_make_request(KVM_REQ_EVENT, vcpu); 4881 4882 ++vcpu->stat.irq_window_exits; 4883 return 1; 4884 } 4885 4886 static int handle_halt(struct kvm_vcpu *vcpu) 4887 { 4888 return kvm_emulate_halt(vcpu); 4889 } 4890 4891 static int handle_vmcall(struct kvm_vcpu *vcpu) 4892 { 4893 return kvm_emulate_hypercall(vcpu); 4894 } 4895 4896 static int handle_invd(struct kvm_vcpu *vcpu) 4897 { 4898 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; 4899 } 4900 4901 static int handle_invlpg(struct kvm_vcpu *vcpu) 4902 { 4903 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4904 4905 kvm_mmu_invlpg(vcpu, exit_qualification); 4906 return kvm_skip_emulated_instruction(vcpu); 4907 } 4908 4909 static int handle_rdpmc(struct kvm_vcpu *vcpu) 4910 { 4911 int err; 4912 4913 err = kvm_rdpmc(vcpu); 4914 return kvm_complete_insn_gp(vcpu, err); 4915 } 4916 4917 static int handle_wbinvd(struct kvm_vcpu *vcpu) 4918 { 4919 return kvm_emulate_wbinvd(vcpu); 4920 } 4921 4922 static int handle_xsetbv(struct kvm_vcpu *vcpu) 4923 { 4924 u64 new_bv = kvm_read_edx_eax(vcpu); 4925 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 4926 4927 if 
(kvm_set_xcr(vcpu, index, new_bv) == 0) 4928 return kvm_skip_emulated_instruction(vcpu); 4929 return 1; 4930 } 4931 4932 static int handle_xsaves(struct kvm_vcpu *vcpu) 4933 { 4934 kvm_skip_emulated_instruction(vcpu); 4935 WARN(1, "this should never happen\n"); 4936 return 1; 4937 } 4938 4939 static int handle_xrstors(struct kvm_vcpu *vcpu) 4940 { 4941 kvm_skip_emulated_instruction(vcpu); 4942 WARN(1, "this should never happen\n"); 4943 return 1; 4944 } 4945 4946 static int handle_apic_access(struct kvm_vcpu *vcpu) 4947 { 4948 if (likely(fasteoi)) { 4949 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4950 int access_type, offset; 4951 4952 access_type = exit_qualification & APIC_ACCESS_TYPE; 4953 offset = exit_qualification & APIC_ACCESS_OFFSET; 4954 /* 4955 * Sane guest uses MOV to write EOI, with written value 4956 * not cared. So make a short-circuit here by avoiding 4957 * heavy instruction emulation. 4958 */ 4959 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 4960 (offset == APIC_EOI)) { 4961 kvm_lapic_set_eoi(vcpu); 4962 return kvm_skip_emulated_instruction(vcpu); 4963 } 4964 } 4965 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; 4966 } 4967 4968 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 4969 { 4970 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4971 int vector = exit_qualification & 0xff; 4972 4973 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 4974 kvm_apic_set_eoi_accelerated(vcpu, vector); 4975 return 1; 4976 } 4977 4978 static int handle_apic_write(struct kvm_vcpu *vcpu) 4979 { 4980 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4981 u32 offset = exit_qualification & 0xfff; 4982 4983 /* APIC-write VM exit is trap-like and thus no need to adjust IP */ 4984 kvm_apic_write_nodecode(vcpu, offset); 4985 return 1; 4986 } 4987 4988 static int handle_task_switch(struct kvm_vcpu *vcpu) 4989 { 4990 struct vcpu_vmx *vmx = to_vmx(vcpu); 4991 unsigned long 
exit_qualification; 4992 bool has_error_code = false; 4993 u32 error_code = 0; 4994 u16 tss_selector; 4995 int reason, type, idt_v, idt_index; 4996 4997 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 4998 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 4999 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5000 5001 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5002 5003 reason = (u32)exit_qualification >> 30; 5004 if (reason == TASK_SWITCH_GATE && idt_v) { 5005 switch (type) { 5006 case INTR_TYPE_NMI_INTR: 5007 vcpu->arch.nmi_injected = false; 5008 vmx_set_nmi_mask(vcpu, true); 5009 break; 5010 case INTR_TYPE_EXT_INTR: 5011 case INTR_TYPE_SOFT_INTR: 5012 kvm_clear_interrupt_queue(vcpu); 5013 break; 5014 case INTR_TYPE_HARD_EXCEPTION: 5015 if (vmx->idt_vectoring_info & 5016 VECTORING_INFO_DELIVER_CODE_MASK) { 5017 has_error_code = true; 5018 error_code = 5019 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5020 } 5021 /* fall through */ 5022 case INTR_TYPE_SOFT_EXCEPTION: 5023 kvm_clear_exception_queue(vcpu); 5024 break; 5025 default: 5026 break; 5027 } 5028 } 5029 tss_selector = exit_qualification; 5030 5031 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5032 type != INTR_TYPE_EXT_INTR && 5033 type != INTR_TYPE_NMI_INTR)) 5034 skip_emulated_instruction(vcpu); 5035 5036 if (kvm_task_switch(vcpu, tss_selector, 5037 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, 5038 has_error_code, error_code) == EMULATE_FAIL) { 5039 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5040 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 5041 vcpu->run->internal.ndata = 0; 5042 return 0; 5043 } 5044 5045 /* 5046 * TODO: What about debug traps on tss switch? 5047 * Are we supposed to inject them and update dr6? 
5048 */ 5049 5050 return 1; 5051 } 5052 5053 static int handle_ept_violation(struct kvm_vcpu *vcpu) 5054 { 5055 unsigned long exit_qualification; 5056 gpa_t gpa; 5057 u64 error_code; 5058 5059 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5060 5061 /* 5062 * EPT violation happened while executing iret from NMI, 5063 * "blocked by NMI" bit has to be set before next VM entry. 5064 * There are errata that may cause this bit to not be set: 5065 * AAK134, BY25. 5066 */ 5067 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5068 enable_vnmi && 5069 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5070 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5071 5072 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5073 trace_kvm_page_fault(gpa, exit_qualification); 5074 5075 /* Is it a read fault? */ 5076 error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) 5077 ? PFERR_USER_MASK : 0; 5078 /* Is it a write fault? */ 5079 error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) 5080 ? PFERR_WRITE_MASK : 0; 5081 /* Is it a fetch fault? */ 5082 error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) 5083 ? PFERR_FETCH_MASK : 0; 5084 /* ept page table entry is present? */ 5085 error_code |= (exit_qualification & 5086 (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | 5087 EPT_VIOLATION_EXECUTABLE)) 5088 ? PFERR_PRESENT_MASK : 0; 5089 5090 error_code |= (exit_qualification & 0x100) != 0 ? 5091 PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; 5092 5093 vcpu->arch.exit_qualification = exit_qualification; 5094 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5095 } 5096 5097 static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5098 { 5099 gpa_t gpa; 5100 5101 /* 5102 * A nested guest cannot optimize MMIO vmexits, because we have an 5103 * nGPA here instead of the required GPA. 
5104 */ 5105 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5106 if (!is_guest_mode(vcpu) && 5107 !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5108 trace_kvm_fast_mmio(gpa); 5109 /* 5110 * Doing kvm_skip_emulated_instruction() depends on undefined 5111 * behavior: Intel's manual doesn't mandate 5112 * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG 5113 * occurs and while on real hardware it was observed to be set, 5114 * other hypervisors (namely Hyper-V) don't set it, we end up 5115 * advancing IP with some random value. Disable fast mmio when 5116 * running nested and keep it for real hardware in hope that 5117 * VM_EXIT_INSTRUCTION_LEN will always be set correctly. 5118 */ 5119 if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) 5120 return kvm_skip_emulated_instruction(vcpu); 5121 else 5122 return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == 5123 EMULATE_DONE; 5124 } 5125 5126 return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); 5127 } 5128 5129 static int handle_nmi_window(struct kvm_vcpu *vcpu) 5130 { 5131 WARN_ON_ONCE(!enable_vnmi); 5132 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, 5133 CPU_BASED_VIRTUAL_NMI_PENDING); 5134 ++vcpu->stat.nmi_window_exits; 5135 kvm_make_request(KVM_REQ_EVENT, vcpu); 5136 5137 return 1; 5138 } 5139 5140 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5141 { 5142 struct vcpu_vmx *vmx = to_vmx(vcpu); 5143 enum emulation_result err = EMULATE_DONE; 5144 int ret = 1; 5145 u32 cpu_exec_ctrl; 5146 bool intr_window_requested; 5147 unsigned count = 130; 5148 5149 /* 5150 * We should never reach the point where we are emulating L2 5151 * due to invalid guest state as that means we incorrectly 5152 * allowed a nested VMEntry with an invalid vmcs12. 
5153 */ 5154 WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending); 5155 5156 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5157 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; 5158 5159 while (vmx->emulation_required && count-- != 0) { 5160 if (intr_window_requested && vmx_interrupt_allowed(vcpu)) 5161 return handle_interrupt_window(&vmx->vcpu); 5162 5163 if (kvm_test_request(KVM_REQ_EVENT, vcpu)) 5164 return 1; 5165 5166 err = kvm_emulate_instruction(vcpu, 0); 5167 5168 if (err == EMULATE_USER_EXIT) { 5169 ++vcpu->stat.mmio_exits; 5170 ret = 0; 5171 goto out; 5172 } 5173 5174 if (err != EMULATE_DONE) 5175 goto emulation_error; 5176 5177 if (vmx->emulation_required && !vmx->rmode.vm86_active && 5178 vcpu->arch.exception.pending) 5179 goto emulation_error; 5180 5181 if (vcpu->arch.halt_request) { 5182 vcpu->arch.halt_request = 0; 5183 ret = kvm_vcpu_halt(vcpu); 5184 goto out; 5185 } 5186 5187 if (signal_pending(current)) 5188 goto out; 5189 if (need_resched()) 5190 schedule(); 5191 } 5192 5193 out: 5194 return ret; 5195 5196 emulation_error: 5197 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5198 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 5199 vcpu->run->internal.ndata = 0; 5200 return 0; 5201 } 5202 5203 static void grow_ple_window(struct kvm_vcpu *vcpu) 5204 { 5205 struct vcpu_vmx *vmx = to_vmx(vcpu); 5206 int old = vmx->ple_window; 5207 5208 vmx->ple_window = __grow_ple_window(old, ple_window, 5209 ple_window_grow, 5210 ple_window_max); 5211 5212 if (vmx->ple_window != old) 5213 vmx->ple_window_dirty = true; 5214 5215 trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); 5216 } 5217 5218 static void shrink_ple_window(struct kvm_vcpu *vcpu) 5219 { 5220 struct vcpu_vmx *vmx = to_vmx(vcpu); 5221 int old = vmx->ple_window; 5222 5223 vmx->ple_window = __shrink_ple_window(old, ple_window, 5224 ple_window_shrink, 5225 ple_window); 5226 5227 if (vmx->ple_window != old) 5228 
vmx->ple_window_dirty = true; 5229 5230 trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); 5231 } 5232 5233 /* 5234 * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. 5235 */ 5236 static void wakeup_handler(void) 5237 { 5238 struct kvm_vcpu *vcpu; 5239 int cpu = smp_processor_id(); 5240 5241 spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 5242 list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), 5243 blocked_vcpu_list) { 5244 struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); 5245 5246 if (pi_test_on(pi_desc) == 1) 5247 kvm_vcpu_kick(vcpu); 5248 } 5249 spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); 5250 } 5251 5252 static void vmx_enable_tdp(void) 5253 { 5254 kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, 5255 enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, 5256 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 5257 0ull, VMX_EPT_EXECUTABLE_MASK, 5258 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, 5259 VMX_EPT_RWX_MASK, 0ull); 5260 5261 ept_set_mmio_spte_mask(); 5262 kvm_enable_tdp(); 5263 } 5264 5265 /* 5266 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 5267 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 5268 */ 5269 static int handle_pause(struct kvm_vcpu *vcpu) 5270 { 5271 if (!kvm_pause_in_guest(vcpu->kvm)) 5272 grow_ple_window(vcpu); 5273 5274 /* 5275 * Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting" 5276 * VM-execution control is ignored if CPL > 0. OTOH, KVM 5277 * never set PAUSE_EXITING and just set PLE if supported, 5278 * so the vcpu must be CPL=0 if it gets a PAUSE exit. 
5279 */ 5280 kvm_vcpu_on_spin(vcpu, true); 5281 return kvm_skip_emulated_instruction(vcpu); 5282 } 5283 5284 static int handle_nop(struct kvm_vcpu *vcpu) 5285 { 5286 return kvm_skip_emulated_instruction(vcpu); 5287 } 5288 5289 static int handle_mwait(struct kvm_vcpu *vcpu) 5290 { 5291 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); 5292 return handle_nop(vcpu); 5293 } 5294 5295 static int handle_invalid_op(struct kvm_vcpu *vcpu) 5296 { 5297 kvm_queue_exception(vcpu, UD_VECTOR); 5298 return 1; 5299 } 5300 5301 static int handle_monitor_trap(struct kvm_vcpu *vcpu) 5302 { 5303 return 1; 5304 } 5305 5306 static int handle_monitor(struct kvm_vcpu *vcpu) 5307 { 5308 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); 5309 return handle_nop(vcpu); 5310 } 5311 5312 static int handle_invpcid(struct kvm_vcpu *vcpu) 5313 { 5314 u32 vmx_instruction_info; 5315 unsigned long type; 5316 bool pcid_enabled; 5317 gva_t gva; 5318 struct x86_exception e; 5319 unsigned i; 5320 unsigned long roots_to_free = 0; 5321 struct { 5322 u64 pcid; 5323 u64 gla; 5324 } operand; 5325 5326 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { 5327 kvm_queue_exception(vcpu, UD_VECTOR); 5328 return 1; 5329 } 5330 5331 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5332 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 5333 5334 if (type > 3) { 5335 kvm_inject_gp(vcpu, 0); 5336 return 1; 5337 } 5338 5339 /* According to the Intel instruction reference, the memory operand 5340 * is read even if it isn't needed (e.g., for type==all) 5341 */ 5342 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 5343 vmx_instruction_info, false, &gva)) 5344 return 1; 5345 5346 if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { 5347 kvm_inject_page_fault(vcpu, &e); 5348 return 1; 5349 } 5350 5351 if (operand.pcid >> 12 != 0) { 5352 kvm_inject_gp(vcpu, 0); 5353 return 1; 5354 } 5355 5356 pcid_enabled = 
kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); 5357 5358 switch (type) { 5359 case INVPCID_TYPE_INDIV_ADDR: 5360 if ((!pcid_enabled && (operand.pcid != 0)) || 5361 is_noncanonical_address(operand.gla, vcpu)) { 5362 kvm_inject_gp(vcpu, 0); 5363 return 1; 5364 } 5365 kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); 5366 return kvm_skip_emulated_instruction(vcpu); 5367 5368 case INVPCID_TYPE_SINGLE_CTXT: 5369 if (!pcid_enabled && (operand.pcid != 0)) { 5370 kvm_inject_gp(vcpu, 0); 5371 return 1; 5372 } 5373 5374 if (kvm_get_active_pcid(vcpu) == operand.pcid) { 5375 kvm_mmu_sync_roots(vcpu); 5376 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 5377 } 5378 5379 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 5380 if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) 5381 == operand.pcid) 5382 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5383 5384 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); 5385 /* 5386 * If neither the current cr3 nor any of the prev_roots use the 5387 * given PCID, then nothing needs to be done here because a 5388 * resync will happen anyway before switching to any other CR3. 5389 */ 5390 5391 return kvm_skip_emulated_instruction(vcpu); 5392 5393 case INVPCID_TYPE_ALL_NON_GLOBAL: 5394 /* 5395 * Currently, KVM doesn't mark global entries in the shadow 5396 * page tables, so a non-global flush just degenerates to a 5397 * global flush. If needed, we could optimize this later by 5398 * keeping track of global entries in shadow page tables. 
5399 */ 5400 5401 /* fall-through */ 5402 case INVPCID_TYPE_ALL_INCL_GLOBAL: 5403 kvm_mmu_unload(vcpu); 5404 return kvm_skip_emulated_instruction(vcpu); 5405 5406 default: 5407 BUG(); /* We have already checked above that type <= 3 */ 5408 } 5409 } 5410 5411 static int handle_pml_full(struct kvm_vcpu *vcpu) 5412 { 5413 unsigned long exit_qualification; 5414 5415 trace_kvm_pml_full(vcpu->vcpu_id); 5416 5417 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5418 5419 /* 5420 * PML buffer FULL happened while executing iret from NMI, 5421 * "blocked by NMI" bit has to be set before next VM entry. 5422 */ 5423 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5424 enable_vnmi && 5425 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5426 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 5427 GUEST_INTR_STATE_NMI); 5428 5429 /* 5430 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 5431 * here.., and there's no userspace involvement needed for PML. 5432 */ 5433 return 1; 5434 } 5435 5436 static int handle_preemption_timer(struct kvm_vcpu *vcpu) 5437 { 5438 if (!to_vmx(vcpu)->req_immediate_exit) 5439 kvm_lapic_expired_hv_timer(vcpu); 5440 return 1; 5441 } 5442 5443 /* 5444 * When nested=0, all VMX instruction VM Exits filter here. The handlers 5445 * are overwritten by nested_vmx_setup() when nested=1. 5446 */ 5447 static int handle_vmx_instruction(struct kvm_vcpu *vcpu) 5448 { 5449 kvm_queue_exception(vcpu, UD_VECTOR); 5450 return 1; 5451 } 5452 5453 static int handle_encls(struct kvm_vcpu *vcpu) 5454 { 5455 /* 5456 * SGX virtualization is not yet supported. There is no software 5457 * enable bit for SGX, so we have to trap ENCLS and inject a #UD 5458 * to prevent the guest from executing ENCLS. 5459 */ 5460 kvm_queue_exception(vcpu, UD_VECTOR); 5461 return 1; 5462 } 5463 5464 /* 5465 * The exit handlers return 1 if the exit was handled fully and guest execution 5466 * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs 5467 * to be done to userspace and return 0. 5468 */ 5469 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 5470 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 5471 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 5472 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 5473 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 5474 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 5475 [EXIT_REASON_CR_ACCESS] = handle_cr, 5476 [EXIT_REASON_DR_ACCESS] = handle_dr, 5477 [EXIT_REASON_CPUID] = handle_cpuid, 5478 [EXIT_REASON_MSR_READ] = handle_rdmsr, 5479 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 5480 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 5481 [EXIT_REASON_HLT] = handle_halt, 5482 [EXIT_REASON_INVD] = handle_invd, 5483 [EXIT_REASON_INVLPG] = handle_invlpg, 5484 [EXIT_REASON_RDPMC] = handle_rdpmc, 5485 [EXIT_REASON_VMCALL] = handle_vmcall, 5486 [EXIT_REASON_VMCLEAR] = handle_vmx_instruction, 5487 [EXIT_REASON_VMLAUNCH] = handle_vmx_instruction, 5488 [EXIT_REASON_VMPTRLD] = handle_vmx_instruction, 5489 [EXIT_REASON_VMPTRST] = handle_vmx_instruction, 5490 [EXIT_REASON_VMREAD] = handle_vmx_instruction, 5491 [EXIT_REASON_VMRESUME] = handle_vmx_instruction, 5492 [EXIT_REASON_VMWRITE] = handle_vmx_instruction, 5493 [EXIT_REASON_VMOFF] = handle_vmx_instruction, 5494 [EXIT_REASON_VMON] = handle_vmx_instruction, 5495 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 5496 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 5497 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 5498 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 5499 [EXIT_REASON_WBINVD] = handle_wbinvd, 5500 [EXIT_REASON_XSETBV] = handle_xsetbv, 5501 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 5502 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 5503 [EXIT_REASON_GDTR_IDTR] = handle_desc, 5504 [EXIT_REASON_LDTR_TR] = handle_desc, 5505 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 5506 
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 5507 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 5508 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, 5509 [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap, 5510 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, 5511 [EXIT_REASON_INVEPT] = handle_vmx_instruction, 5512 [EXIT_REASON_INVVPID] = handle_vmx_instruction, 5513 [EXIT_REASON_RDRAND] = handle_invalid_op, 5514 [EXIT_REASON_RDSEED] = handle_invalid_op, 5515 [EXIT_REASON_XSAVES] = handle_xsaves, 5516 [EXIT_REASON_XRSTORS] = handle_xrstors, 5517 [EXIT_REASON_PML_FULL] = handle_pml_full, 5518 [EXIT_REASON_INVPCID] = handle_invpcid, 5519 [EXIT_REASON_VMFUNC] = handle_vmx_instruction, 5520 [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, 5521 [EXIT_REASON_ENCLS] = handle_encls, 5522 }; 5523 5524 static const int kvm_vmx_max_exit_handlers = 5525 ARRAY_SIZE(kvm_vmx_exit_handlers); 5526 5527 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) 5528 { 5529 *info1 = vmcs_readl(EXIT_QUALIFICATION); 5530 *info2 = vmcs_read32(VM_EXIT_INTR_INFO); 5531 } 5532 5533 static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) 5534 { 5535 if (vmx->pml_pg) { 5536 __free_page(vmx->pml_pg); 5537 vmx->pml_pg = NULL; 5538 } 5539 } 5540 5541 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) 5542 { 5543 struct vcpu_vmx *vmx = to_vmx(vcpu); 5544 u64 *pml_buf; 5545 u16 pml_idx; 5546 5547 pml_idx = vmcs_read16(GUEST_PML_INDEX); 5548 5549 /* Do nothing if PML buffer is empty */ 5550 if (pml_idx == (PML_ENTITY_NUM - 1)) 5551 return; 5552 5553 /* PML index always points to next available PML buffer entity */ 5554 if (pml_idx >= PML_ENTITY_NUM) 5555 pml_idx = 0; 5556 else 5557 pml_idx++; 5558 5559 pml_buf = page_address(vmx->pml_pg); 5560 for (; pml_idx < PML_ENTITY_NUM; pml_idx++) { 5561 u64 gpa; 5562 5563 gpa = pml_buf[pml_idx]; 5564 WARN_ON(gpa & (PAGE_SIZE - 1)); 5565 kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); 5566 } 5567 5568 /* 
reset PML index */ 5569 vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); 5570 } 5571 5572 /* 5573 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap. 5574 * Called before reporting dirty_bitmap to userspace. 5575 */ 5576 static void kvm_flush_pml_buffers(struct kvm *kvm) 5577 { 5578 int i; 5579 struct kvm_vcpu *vcpu; 5580 /* 5581 * We only need to kick vcpu out of guest mode here, as PML buffer 5582 * is flushed at beginning of all VMEXITs, and it's obvious that only 5583 * vcpus running in guest are possible to have unflushed GPAs in PML 5584 * buffer. 5585 */ 5586 kvm_for_each_vcpu(i, vcpu, kvm) 5587 kvm_vcpu_kick(vcpu); 5588 } 5589 5590 static void vmx_dump_sel(char *name, uint32_t sel) 5591 { 5592 pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", 5593 name, vmcs_read16(sel), 5594 vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), 5595 vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), 5596 vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); 5597 } 5598 5599 static void vmx_dump_dtsel(char *name, uint32_t limit) 5600 { 5601 pr_err("%s limit=0x%08x, base=0x%016lx\n", 5602 name, vmcs_read32(limit), 5603 vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); 5604 } 5605 5606 static void dump_vmcs(void) 5607 { 5608 u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); 5609 u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); 5610 u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5611 u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); 5612 u32 secondary_exec_control = 0; 5613 unsigned long cr4 = vmcs_readl(GUEST_CR4); 5614 u64 efer = vmcs_read64(GUEST_IA32_EFER); 5615 int i, n; 5616 5617 if (cpu_has_secondary_exec_ctrls()) 5618 secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 5619 5620 pr_err("*** Guest State ***\n"); 5621 pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 5622 vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), 5623 
vmcs_readl(CR0_GUEST_HOST_MASK)); 5624 pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", 5625 cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); 5626 pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); 5627 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && 5628 (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) 5629 { 5630 pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n", 5631 vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1)); 5632 pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n", 5633 vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3)); 5634 } 5635 pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", 5636 vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); 5637 pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", 5638 vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); 5639 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 5640 vmcs_readl(GUEST_SYSENTER_ESP), 5641 vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); 5642 vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); 5643 vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); 5644 vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); 5645 vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); 5646 vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); 5647 vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); 5648 vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); 5649 vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); 5650 vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); 5651 vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); 5652 if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || 5653 (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) 5654 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", 5655 efer, vmcs_read64(GUEST_IA32_PAT)); 5656 pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n", 5657 vmcs_read64(GUEST_IA32_DEBUGCTL), 5658 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); 5659 if (cpu_has_load_perf_global_ctrl() && 5660 vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 5661 pr_err("PerfGlobCtl = 0x%016llx\n", 5662 
vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL)); 5663 if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) 5664 pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS)); 5665 pr_err("Interruptibility = %08x ActivityState = %08x\n", 5666 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), 5667 vmcs_read32(GUEST_ACTIVITY_STATE)); 5668 if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 5669 pr_err("InterruptStatus = %04x\n", 5670 vmcs_read16(GUEST_INTR_STATUS)); 5671 5672 pr_err("*** Host State ***\n"); 5673 pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", 5674 vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); 5675 pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", 5676 vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), 5677 vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), 5678 vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), 5679 vmcs_read16(HOST_TR_SELECTOR)); 5680 pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", 5681 vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), 5682 vmcs_readl(HOST_TR_BASE)); 5683 pr_err("GDTBase=%016lx IDTBase=%016lx\n", 5684 vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); 5685 pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", 5686 vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), 5687 vmcs_readl(HOST_CR4)); 5688 pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", 5689 vmcs_readl(HOST_IA32_SYSENTER_ESP), 5690 vmcs_read32(HOST_IA32_SYSENTER_CS), 5691 vmcs_readl(HOST_IA32_SYSENTER_EIP)); 5692 if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) 5693 pr_err("EFER = 0x%016llx PAT = 0x%016llx\n", 5694 vmcs_read64(HOST_IA32_EFER), 5695 vmcs_read64(HOST_IA32_PAT)); 5696 if (cpu_has_load_perf_global_ctrl() && 5697 vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 5698 pr_err("PerfGlobCtl = 0x%016llx\n", 5699 vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); 5700 5701 pr_err("*** Control State ***\n"); 5702 pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", 5703 pin_based_exec_ctrl, 
cpu_based_exec_ctrl, secondary_exec_control); 5704 pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); 5705 pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", 5706 vmcs_read32(EXCEPTION_BITMAP), 5707 vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), 5708 vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); 5709 pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", 5710 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 5711 vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), 5712 vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); 5713 pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", 5714 vmcs_read32(VM_EXIT_INTR_INFO), 5715 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 5716 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 5717 pr_err(" reason=%08x qualification=%016lx\n", 5718 vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); 5719 pr_err("IDTVectoring: info=%08x errcode=%08x\n", 5720 vmcs_read32(IDT_VECTORING_INFO_FIELD), 5721 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 5722 pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET)); 5723 if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) 5724 pr_err("TSC Multiplier = 0x%016llx\n", 5725 vmcs_read64(TSC_MULTIPLIER)); 5726 if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) 5727 pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); 5728 if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) 5729 pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); 5730 if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) 5731 pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER)); 5732 n = vmcs_read32(CR3_TARGET_COUNT); 5733 for (i = 0; i + 1 < n; i += 4) 5734 pr_err("CR3 target%u=%016lx target%u=%016lx\n", 5735 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), 5736 i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); 5737 if (i < n) 5738 pr_err("CR3 target%u=%016lx\n", 5739 i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); 5740 if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) 5741 pr_err("PLE Gap=%08x 
Window=%08x\n", 5742 vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); 5743 if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) 5744 pr_err("Virtual processor ID = 0x%04x\n", 5745 vmcs_read16(VIRTUAL_PROCESSOR_ID)); 5746 } 5747 5748 /* 5749 * The guest has exited. See if we can fix it or if we need userspace 5750 * assistance. 5751 */ 5752 static int vmx_handle_exit(struct kvm_vcpu *vcpu) 5753 { 5754 struct vcpu_vmx *vmx = to_vmx(vcpu); 5755 u32 exit_reason = vmx->exit_reason; 5756 u32 vectoring_info = vmx->idt_vectoring_info; 5757 5758 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); 5759 5760 /* 5761 * Flush logged GPAs PML buffer, this will make dirty_bitmap more 5762 * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before 5763 * querying dirty_bitmap, we only need to kick all vcpus out of guest 5764 * mode as if vcpus is in root mode, the PML buffer must has been 5765 * flushed already. 5766 */ 5767 if (enable_pml) 5768 vmx_flush_pml_buffer(vcpu); 5769 5770 /* If guest state is invalid, start emulating */ 5771 if (vmx->emulation_required) 5772 return handle_invalid_guest_state(vcpu); 5773 5774 if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) 5775 return nested_vmx_reflect_vmexit(vcpu, exit_reason); 5776 5777 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 5778 dump_vmcs(); 5779 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 5780 vcpu->run->fail_entry.hardware_entry_failure_reason 5781 = exit_reason; 5782 return 0; 5783 } 5784 5785 if (unlikely(vmx->fail)) { 5786 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 5787 vcpu->run->fail_entry.hardware_entry_failure_reason 5788 = vmcs_read32(VM_INSTRUCTION_ERROR); 5789 return 0; 5790 } 5791 5792 /* 5793 * Note: 5794 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by 5795 * delivery event since it indicates guest is accessing MMIO. 5796 * The vm-exit can be triggered again after return to guest that 5797 * will cause infinite loop. 
5798 */ 5799 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 5800 (exit_reason != EXIT_REASON_EXCEPTION_NMI && 5801 exit_reason != EXIT_REASON_EPT_VIOLATION && 5802 exit_reason != EXIT_REASON_PML_FULL && 5803 exit_reason != EXIT_REASON_TASK_SWITCH)) { 5804 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5805 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; 5806 vcpu->run->internal.ndata = 3; 5807 vcpu->run->internal.data[0] = vectoring_info; 5808 vcpu->run->internal.data[1] = exit_reason; 5809 vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; 5810 if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { 5811 vcpu->run->internal.ndata++; 5812 vcpu->run->internal.data[3] = 5813 vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5814 } 5815 return 0; 5816 } 5817 5818 if (unlikely(!enable_vnmi && 5819 vmx->loaded_vmcs->soft_vnmi_blocked)) { 5820 if (vmx_interrupt_allowed(vcpu)) { 5821 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 5822 } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL && 5823 vcpu->arch.nmi_pending) { 5824 /* 5825 * This CPU don't support us in finding the end of an 5826 * NMI-blocked window if the guest runs with IRQs 5827 * disabled. So we pull the trigger after 1 s of 5828 * futile waiting, but inform the user about this. 5829 */ 5830 printk(KERN_WARNING "%s: Breaking out of NMI-blocked " 5831 "state on VCPU %d after 1 s timeout\n", 5832 __func__, vcpu->vcpu_id); 5833 vmx->loaded_vmcs->soft_vnmi_blocked = 0; 5834 } 5835 } 5836 5837 if (exit_reason < kvm_vmx_max_exit_handlers 5838 && kvm_vmx_exit_handlers[exit_reason]) 5839 return kvm_vmx_exit_handlers[exit_reason](vcpu); 5840 else { 5841 vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", 5842 exit_reason); 5843 kvm_queue_exception(vcpu, UD_VECTOR); 5844 return 1; 5845 } 5846 } 5847 5848 /* 5849 * Software based L1D cache flush which is used when microcode providing 5850 * the cache control MSR is not loaded. 
5851 * 5852 * The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to 5853 * flush it is required to read in 64 KiB because the replacement algorithm 5854 * is not exactly LRU. This could be sized at runtime via topology 5855 * information but as all relevant affected CPUs have 32KiB L1D cache size 5856 * there is no point in doing so. 5857 */ 5858 static void vmx_l1d_flush(struct kvm_vcpu *vcpu) 5859 { 5860 int size = PAGE_SIZE << L1D_CACHE_ORDER; 5861 5862 /* 5863 * This code is only executed when the the flush mode is 'cond' or 5864 * 'always' 5865 */ 5866 if (static_branch_likely(&vmx_l1d_flush_cond)) { 5867 bool flush_l1d; 5868 5869 /* 5870 * Clear the per-vcpu flush bit, it gets set again 5871 * either from vcpu_run() or from one of the unsafe 5872 * VMEXIT handlers. 5873 */ 5874 flush_l1d = vcpu->arch.l1tf_flush_l1d; 5875 vcpu->arch.l1tf_flush_l1d = false; 5876 5877 /* 5878 * Clear the per-cpu flush bit, it gets set again from 5879 * the interrupt handlers. 5880 */ 5881 flush_l1d |= kvm_get_cpu_l1tf_flush_l1d(); 5882 kvm_clear_cpu_l1tf_flush_l1d(); 5883 5884 if (!flush_l1d) 5885 return; 5886 } 5887 5888 vcpu->stat.l1d_flush++; 5889 5890 if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { 5891 wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); 5892 return; 5893 } 5894 5895 asm volatile( 5896 /* First ensure the pages are in the TLB */ 5897 "xorl %%eax, %%eax\n" 5898 ".Lpopulate_tlb:\n\t" 5899 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 5900 "addl $4096, %%eax\n\t" 5901 "cmpl %%eax, %[size]\n\t" 5902 "jne .Lpopulate_tlb\n\t" 5903 "xorl %%eax, %%eax\n\t" 5904 "cpuid\n\t" 5905 /* Now fill the cache */ 5906 "xorl %%eax, %%eax\n" 5907 ".Lfill_cache:\n" 5908 "movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t" 5909 "addl $64, %%eax\n\t" 5910 "cmpl %%eax, %[size]\n\t" 5911 "jne .Lfill_cache\n\t" 5912 "lfence\n" 5913 :: [flush_pages] "r" (vmx_l1d_flush_pages), 5914 [size] "r" (size) 5915 : "eax", "ebx", "ecx", "edx"); 5916 } 5917 5918 static void 
update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 5919 { 5920 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5921 5922 if (is_guest_mode(vcpu) && 5923 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 5924 return; 5925 5926 if (irr == -1 || tpr < irr) { 5927 vmcs_write32(TPR_THRESHOLD, 0); 5928 return; 5929 } 5930 5931 vmcs_write32(TPR_THRESHOLD, irr); 5932 } 5933 5934 void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) 5935 { 5936 u32 sec_exec_control; 5937 5938 if (!lapic_in_kernel(vcpu)) 5939 return; 5940 5941 if (!flexpriority_enabled && 5942 !cpu_has_vmx_virtualize_x2apic_mode()) 5943 return; 5944 5945 /* Postpone execution until vmcs01 is the current VMCS. */ 5946 if (is_guest_mode(vcpu)) { 5947 to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; 5948 return; 5949 } 5950 5951 sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 5952 sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 5953 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); 5954 5955 switch (kvm_get_apic_mode(vcpu)) { 5956 case LAPIC_MODE_INVALID: 5957 WARN_ONCE(true, "Invalid local APIC state"); 5958 case LAPIC_MODE_DISABLED: 5959 break; 5960 case LAPIC_MODE_XAPIC: 5961 if (flexpriority_enabled) { 5962 sec_exec_control |= 5963 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 5964 vmx_flush_tlb(vcpu, true); 5965 } 5966 break; 5967 case LAPIC_MODE_X2APIC: 5968 if (cpu_has_vmx_virtualize_x2apic_mode()) 5969 sec_exec_control |= 5970 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 5971 break; 5972 } 5973 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); 5974 5975 vmx_update_msr_bitmap(vcpu); 5976 } 5977 5978 static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) 5979 { 5980 if (!is_guest_mode(vcpu)) { 5981 vmcs_write64(APIC_ACCESS_ADDR, hpa); 5982 vmx_flush_tlb(vcpu, true); 5983 } 5984 } 5985 5986 static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) 5987 { 5988 u16 status; 5989 u8 old; 5990 5991 if (max_isr == -1) 5992 max_isr = 0; 5993 5994 status 
= vmcs_read16(GUEST_INTR_STATUS); 5995 old = status >> 8; 5996 if (max_isr != old) { 5997 status &= 0xff; 5998 status |= max_isr << 8; 5999 vmcs_write16(GUEST_INTR_STATUS, status); 6000 } 6001 } 6002 6003 static void vmx_set_rvi(int vector) 6004 { 6005 u16 status; 6006 u8 old; 6007 6008 if (vector == -1) 6009 vector = 0; 6010 6011 status = vmcs_read16(GUEST_INTR_STATUS); 6012 old = (u8)status & 0xff; 6013 if ((u8)vector != old) { 6014 status &= ~0xff; 6015 status |= (u8)vector; 6016 vmcs_write16(GUEST_INTR_STATUS, status); 6017 } 6018 } 6019 6020 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 6021 { 6022 /* 6023 * When running L2, updating RVI is only relevant when 6024 * vmcs12 virtual-interrupt-delivery enabled. 6025 * However, it can be enabled only when L1 also 6026 * intercepts external-interrupts and in that case 6027 * we should not update vmcs02 RVI but instead intercept 6028 * interrupt. Therefore, do nothing when running L2. 6029 */ 6030 if (!is_guest_mode(vcpu)) 6031 vmx_set_rvi(max_irr); 6032 } 6033 6034 static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 6035 { 6036 struct vcpu_vmx *vmx = to_vmx(vcpu); 6037 int max_irr; 6038 bool max_irr_updated; 6039 6040 WARN_ON(!vcpu->arch.apicv_active); 6041 if (pi_test_on(&vmx->pi_desc)) { 6042 pi_clear_on(&vmx->pi_desc); 6043 /* 6044 * IOMMU can write to PIR.ON, so the barrier matters even on UP. 6045 * But on x86 this is just a compiler barrier anyway. 6046 */ 6047 smp_mb__after_atomic(); 6048 max_irr_updated = 6049 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); 6050 6051 /* 6052 * If we are running L2 and L1 has a new pending interrupt 6053 * which can be injected, we should re-evaluate 6054 * what should be done with this new L1 interrupt. 6055 * If L1 intercepts external-interrupts, we should 6056 * exit from L2 to L1. Otherwise, interrupt should be 6057 * delivered directly to L2. 
6058 */ 6059 if (is_guest_mode(vcpu) && max_irr_updated) { 6060 if (nested_exit_on_intr(vcpu)) 6061 kvm_vcpu_exiting_guest_mode(vcpu); 6062 else 6063 kvm_make_request(KVM_REQ_EVENT, vcpu); 6064 } 6065 } else { 6066 max_irr = kvm_lapic_find_highest_irr(vcpu); 6067 } 6068 vmx_hwapic_irr_update(vcpu, max_irr); 6069 return max_irr; 6070 } 6071 6072 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 6073 { 6074 if (!kvm_vcpu_apicv_active(vcpu)) 6075 return; 6076 6077 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 6078 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 6079 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 6080 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 6081 } 6082 6083 static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) 6084 { 6085 struct vcpu_vmx *vmx = to_vmx(vcpu); 6086 6087 pi_clear_on(&vmx->pi_desc); 6088 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir)); 6089 } 6090 6091 static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) 6092 { 6093 u32 exit_intr_info = 0; 6094 u16 basic_exit_reason = (u16)vmx->exit_reason; 6095 6096 if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY 6097 || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) 6098 return; 6099 6100 if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) 6101 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 6102 vmx->exit_intr_info = exit_intr_info; 6103 6104 /* if exit due to PF check for async PF */ 6105 if (is_page_fault(exit_intr_info)) 6106 vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); 6107 6108 /* Handle machine checks before interrupts are enabled */ 6109 if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || 6110 is_machine_check(exit_intr_info)) 6111 kvm_machine_check(); 6112 6113 /* We need to handle NMIs before interrupts are enabled */ 6114 if (is_nmi(exit_intr_info)) { 6115 kvm_before_interrupt(&vmx->vcpu); 6116 asm("int $2"); 6117 kvm_after_interrupt(&vmx->vcpu); 6118 } 6119 } 6120 
6121 static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) 6122 { 6123 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 6124 6125 if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) 6126 == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { 6127 unsigned int vector; 6128 unsigned long entry; 6129 gate_desc *desc; 6130 struct vcpu_vmx *vmx = to_vmx(vcpu); 6131 #ifdef CONFIG_X86_64 6132 unsigned long tmp; 6133 #endif 6134 6135 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 6136 desc = (gate_desc *)vmx->host_idt_base + vector; 6137 entry = gate_offset(desc); 6138 asm volatile( 6139 #ifdef CONFIG_X86_64 6140 "mov %%" _ASM_SP ", %[sp]\n\t" 6141 "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" 6142 "push $%c[ss]\n\t" 6143 "push %[sp]\n\t" 6144 #endif 6145 "pushf\n\t" 6146 __ASM_SIZE(push) " $%c[cs]\n\t" 6147 CALL_NOSPEC 6148 : 6149 #ifdef CONFIG_X86_64 6150 [sp]"=&r"(tmp), 6151 #endif 6152 ASM_CALL_CONSTRAINT 6153 : 6154 THUNK_TARGET(entry), 6155 [ss]"i"(__KERNEL_DS), 6156 [cs]"i"(__KERNEL_CS) 6157 ); 6158 } 6159 } 6160 STACK_FRAME_NON_STANDARD(vmx_handle_external_intr); 6161 6162 static bool vmx_has_emulated_msr(int index) 6163 { 6164 switch (index) { 6165 case MSR_IA32_SMBASE: 6166 /* 6167 * We cannot do SMM unless we can run the guest in big 6168 * real mode. 6169 */ 6170 return enable_unrestricted_guest || emulate_invalid_guest_state; 6171 case MSR_AMD64_VIRT_SPEC_CTRL: 6172 /* This is AMD only. 
*/ 6173 return false; 6174 default: 6175 return true; 6176 } 6177 } 6178 6179 static bool vmx_pt_supported(void) 6180 { 6181 return pt_mode == PT_MODE_HOST_GUEST; 6182 } 6183 6184 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 6185 { 6186 u32 exit_intr_info; 6187 bool unblock_nmi; 6188 u8 vector; 6189 bool idtv_info_valid; 6190 6191 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 6192 6193 if (enable_vnmi) { 6194 if (vmx->loaded_vmcs->nmi_known_unmasked) 6195 return; 6196 /* 6197 * Can't use vmx->exit_intr_info since we're not sure what 6198 * the exit reason is. 6199 */ 6200 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 6201 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 6202 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 6203 /* 6204 * SDM 3: 27.7.1.2 (September 2008) 6205 * Re-set bit "block by NMI" before VM entry if vmexit caused by 6206 * a guest IRET fault. 6207 * SDM 3: 23.2.2 (September 2008) 6208 * Bit 12 is undefined in any of the following cases: 6209 * If the VM exit sets the valid bit in the IDT-vectoring 6210 * information field. 6211 * If the VM exit is due to a double fault. 
6212 */ 6213 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 6214 vector != DF_VECTOR && !idtv_info_valid) 6215 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 6216 GUEST_INTR_STATE_NMI); 6217 else 6218 vmx->loaded_vmcs->nmi_known_unmasked = 6219 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 6220 & GUEST_INTR_STATE_NMI); 6221 } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked)) 6222 vmx->loaded_vmcs->vnmi_blocked_time += 6223 ktime_to_ns(ktime_sub(ktime_get(), 6224 vmx->loaded_vmcs->entry_time)); 6225 } 6226 6227 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 6228 u32 idt_vectoring_info, 6229 int instr_len_field, 6230 int error_code_field) 6231 { 6232 u8 vector; 6233 int type; 6234 bool idtv_info_valid; 6235 6236 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 6237 6238 vcpu->arch.nmi_injected = false; 6239 kvm_clear_exception_queue(vcpu); 6240 kvm_clear_interrupt_queue(vcpu); 6241 6242 if (!idtv_info_valid) 6243 return; 6244 6245 kvm_make_request(KVM_REQ_EVENT, vcpu); 6246 6247 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 6248 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 6249 6250 switch (type) { 6251 case INTR_TYPE_NMI_INTR: 6252 vcpu->arch.nmi_injected = true; 6253 /* 6254 * SDM 3: 27.7.1.2 (September 2008) 6255 * Clear bit "block by NMI" before VM entry if a NMI 6256 * delivery faulted. 
6257 */ 6258 vmx_set_nmi_mask(vcpu, false); 6259 break; 6260 case INTR_TYPE_SOFT_EXCEPTION: 6261 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 6262 /* fall through */ 6263 case INTR_TYPE_HARD_EXCEPTION: 6264 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 6265 u32 err = vmcs_read32(error_code_field); 6266 kvm_requeue_exception_e(vcpu, vector, err); 6267 } else 6268 kvm_requeue_exception(vcpu, vector); 6269 break; 6270 case INTR_TYPE_SOFT_INTR: 6271 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 6272 /* fall through */ 6273 case INTR_TYPE_EXT_INTR: 6274 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 6275 break; 6276 default: 6277 break; 6278 } 6279 } 6280 6281 static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 6282 { 6283 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 6284 VM_EXIT_INSTRUCTION_LEN, 6285 IDT_VECTORING_ERROR_CODE); 6286 } 6287 6288 static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 6289 { 6290 __vmx_complete_interrupts(vcpu, 6291 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 6292 VM_ENTRY_INSTRUCTION_LEN, 6293 VM_ENTRY_EXCEPTION_ERROR_CODE); 6294 6295 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 6296 } 6297 6298 static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 6299 { 6300 int i, nr_msrs; 6301 struct perf_guest_switch_msr *msrs; 6302 6303 msrs = perf_guest_get_msrs(&nr_msrs); 6304 6305 if (!msrs) 6306 return; 6307 6308 for (i = 0; i < nr_msrs; i++) 6309 if (msrs[i].host == msrs[i].guest) 6310 clear_atomic_switch_msr(vmx, msrs[i].msr); 6311 else 6312 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 6313 msrs[i].host, false); 6314 } 6315 6316 static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) 6317 { 6318 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); 6319 if (!vmx->loaded_vmcs->hv_timer_armed) 6320 vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, 6321 PIN_BASED_VMX_PREEMPTION_TIMER); 6322 vmx->loaded_vmcs->hv_timer_armed = true; 6323 } 6324 6325 static void 
vmx_update_hv_timer(struct kvm_vcpu *vcpu) 6326 { 6327 struct vcpu_vmx *vmx = to_vmx(vcpu); 6328 u64 tscl; 6329 u32 delta_tsc; 6330 6331 if (vmx->req_immediate_exit) { 6332 vmx_arm_hv_timer(vmx, 0); 6333 return; 6334 } 6335 6336 if (vmx->hv_deadline_tsc != -1) { 6337 tscl = rdtsc(); 6338 if (vmx->hv_deadline_tsc > tscl) 6339 /* set_hv_timer ensures the delta fits in 32-bits */ 6340 delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> 6341 cpu_preemption_timer_multi); 6342 else 6343 delta_tsc = 0; 6344 6345 vmx_arm_hv_timer(vmx, delta_tsc); 6346 return; 6347 } 6348 6349 if (vmx->loaded_vmcs->hv_timer_armed) 6350 vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, 6351 PIN_BASED_VMX_PREEMPTION_TIMER); 6352 vmx->loaded_vmcs->hv_timer_armed = false; 6353 } 6354 6355 void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) 6356 { 6357 if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { 6358 vmx->loaded_vmcs->host_state.rsp = host_rsp; 6359 vmcs_writel(HOST_RSP, host_rsp); 6360 } 6361 } 6362 6363 bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); 6364 6365 static void vmx_vcpu_run(struct kvm_vcpu *vcpu) 6366 { 6367 struct vcpu_vmx *vmx = to_vmx(vcpu); 6368 unsigned long cr3, cr4; 6369 6370 /* Record the guest's net vcpu time for enforced NMI injections. 
*/ 6371 if (unlikely(!enable_vnmi && 6372 vmx->loaded_vmcs->soft_vnmi_blocked)) 6373 vmx->loaded_vmcs->entry_time = ktime_get(); 6374 6375 /* Don't enter VMX if guest state is invalid, let the exit handler 6376 start emulation until we arrive back to a valid state */ 6377 if (vmx->emulation_required) 6378 return; 6379 6380 if (vmx->ple_window_dirty) { 6381 vmx->ple_window_dirty = false; 6382 vmcs_write32(PLE_WINDOW, vmx->ple_window); 6383 } 6384 6385 if (vmx->nested.need_vmcs12_sync) 6386 nested_sync_from_vmcs12(vcpu); 6387 6388 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 6389 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 6390 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) 6391 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 6392 6393 cr3 = __get_current_cr3_fast(); 6394 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 6395 vmcs_writel(HOST_CR3, cr3); 6396 vmx->loaded_vmcs->host_state.cr3 = cr3; 6397 } 6398 6399 cr4 = cr4_read_shadow(); 6400 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 6401 vmcs_writel(HOST_CR4, cr4); 6402 vmx->loaded_vmcs->host_state.cr4 = cr4; 6403 } 6404 6405 /* When single-stepping over STI and MOV SS, we must clear the 6406 * corresponding interruptibility bits in the guest state. Otherwise 6407 * vmentry fails as it then expects bit 14 (BS) in pending debug 6408 * exceptions being set, but that's not correct for the guest debugging 6409 * case. */ 6410 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 6411 vmx_set_interrupt_shadow(vcpu, 0); 6412 6413 if (static_cpu_has(X86_FEATURE_PKU) && 6414 kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && 6415 vcpu->arch.pkru != vmx->host_pkru) 6416 __write_pkru(vcpu->arch.pkru); 6417 6418 pt_guest_enter(vmx); 6419 6420 atomic_switch_perf_msrs(vmx); 6421 6422 vmx_update_hv_timer(vcpu); 6423 6424 /* 6425 * If this vCPU has touched SPEC_CTRL, restore the guest's value if 6426 * it's non-zero. 
Since vmentry is serialising on affected CPUs, there 6427 * is no need to worry about the conditional branch over the wrmsr 6428 * being speculatively taken. 6429 */ 6430 x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); 6431 6432 if (static_branch_unlikely(&vmx_l1d_should_flush)) 6433 vmx_l1d_flush(vcpu); 6434 6435 if (vcpu->arch.cr2 != read_cr2()) 6436 write_cr2(vcpu->arch.cr2); 6437 6438 vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 6439 vmx->loaded_vmcs->launched); 6440 6441 vcpu->arch.cr2 = read_cr2(); 6442 6443 /* 6444 * We do not use IBRS in the kernel. If this vCPU has used the 6445 * SPEC_CTRL MSR it may have left it on; save the value and 6446 * turn it off. This is much more efficient than blindly adding 6447 * it to the atomic save/restore list. Especially as the former 6448 * (Saving guest MSRs on vmexit) doesn't even exist in KVM. 6449 * 6450 * For non-nested case: 6451 * If the L01 MSR bitmap does not intercept the MSR, then we need to 6452 * save it. 6453 * 6454 * For nested case: 6455 * If the L02 MSR bitmap does not intercept the MSR, then we need to 6456 * save it. 6457 */ 6458 if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) 6459 vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); 6460 6461 x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); 6462 6463 /* Eliminate branch target predictions from guest mode */ 6464 vmexit_fill_RSB(); 6465 6466 /* All fields are clean at this point */ 6467 if (static_branch_unlikely(&enable_evmcs)) 6468 current_evmcs->hv_clean_fields |= 6469 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 6470 6471 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ 6472 if (vmx->host_debugctlmsr) 6473 update_debugctlmsr(vmx->host_debugctlmsr); 6474 6475 #ifndef CONFIG_X86_64 6476 /* 6477 * The sysexit path does not restore ds/es, so we must set them to 6478 * a reasonable value ourselves. 
6479 * 6480 * We can't defer this to vmx_prepare_switch_to_host() since that 6481 * function may be executed in interrupt context, which saves and 6482 * restore segments around it, nullifying its effect. 6483 */ 6484 loadsegment(ds, __USER_DS); 6485 loadsegment(es, __USER_DS); 6486 #endif 6487 6488 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 6489 | (1 << VCPU_EXREG_RFLAGS) 6490 | (1 << VCPU_EXREG_PDPTR) 6491 | (1 << VCPU_EXREG_SEGMENTS) 6492 | (1 << VCPU_EXREG_CR3)); 6493 vcpu->arch.regs_dirty = 0; 6494 6495 pt_guest_exit(vmx); 6496 6497 /* 6498 * eager fpu is enabled if PKEY is supported and CR4 is switched 6499 * back on host, so it is safe to read guest PKRU from current 6500 * XSAVE. 6501 */ 6502 if (static_cpu_has(X86_FEATURE_PKU) && 6503 kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { 6504 vcpu->arch.pkru = __read_pkru(); 6505 if (vcpu->arch.pkru != vmx->host_pkru) 6506 __write_pkru(vmx->host_pkru); 6507 } 6508 6509 vmx->nested.nested_run_pending = 0; 6510 vmx->idt_vectoring_info = 0; 6511 6512 vmx->exit_reason = vmx->fail ? 
0xdead : vmcs_read32(VM_EXIT_REASON); 6513 if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) 6514 return; 6515 6516 vmx->loaded_vmcs->launched = 1; 6517 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 6518 6519 vmx_complete_atomic_exit(vmx); 6520 vmx_recover_nmi_blocking(vmx); 6521 vmx_complete_interrupts(vmx); 6522 } 6523 6524 static struct kvm *vmx_vm_alloc(void) 6525 { 6526 struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx), 6527 GFP_KERNEL_ACCOUNT | __GFP_ZERO, 6528 PAGE_KERNEL); 6529 return &kvm_vmx->kvm; 6530 } 6531 6532 static void vmx_vm_free(struct kvm *kvm) 6533 { 6534 vfree(to_kvm_vmx(kvm)); 6535 } 6536 6537 static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 6538 { 6539 struct vcpu_vmx *vmx = to_vmx(vcpu); 6540 6541 if (enable_pml) 6542 vmx_destroy_pml_buffer(vmx); 6543 free_vpid(vmx->vpid); 6544 nested_vmx_free_vcpu(vcpu); 6545 free_loaded_vmcs(vmx->loaded_vmcs); 6546 kfree(vmx->guest_msrs); 6547 kvm_vcpu_uninit(vcpu); 6548 kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); 6549 kmem_cache_free(kvm_vcpu_cache, vmx); 6550 } 6551 6552 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 6553 { 6554 int err; 6555 struct vcpu_vmx *vmx; 6556 unsigned long *msr_bitmap; 6557 int cpu; 6558 6559 vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); 6560 if (!vmx) 6561 return ERR_PTR(-ENOMEM); 6562 6563 vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, 6564 GFP_KERNEL_ACCOUNT); 6565 if (!vmx->vcpu.arch.guest_fpu) { 6566 printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); 6567 err = -ENOMEM; 6568 goto free_partial_vcpu; 6569 } 6570 6571 vmx->vpid = allocate_vpid(); 6572 6573 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 6574 if (err) 6575 goto free_vcpu; 6576 6577 err = -ENOMEM; 6578 6579 /* 6580 * If PML is turned on, failure on enabling PML just results in failure 6581 * of creating the vcpu, therefore we can simplify PML logic (by 6582 * avoiding dealing with cases, such 
as enabling PML partially on vcpus 6583 * for the guest, etc. 6584 */ 6585 if (enable_pml) { 6586 vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 6587 if (!vmx->pml_pg) 6588 goto uninit_vcpu; 6589 } 6590 6591 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT); 6592 BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) 6593 > PAGE_SIZE); 6594 6595 if (!vmx->guest_msrs) 6596 goto free_pml; 6597 6598 err = alloc_loaded_vmcs(&vmx->vmcs01); 6599 if (err < 0) 6600 goto free_msrs; 6601 6602 msr_bitmap = vmx->vmcs01.msr_bitmap; 6603 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R); 6604 vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW); 6605 vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW); 6606 vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 6607 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW); 6608 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW); 6609 vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW); 6610 vmx->msr_bitmap_mode = 0; 6611 6612 vmx->loaded_vmcs = &vmx->vmcs01; 6613 cpu = get_cpu(); 6614 vmx_vcpu_load(&vmx->vcpu, cpu); 6615 vmx->vcpu.cpu = cpu; 6616 vmx_vcpu_setup(vmx); 6617 vmx_vcpu_put(&vmx->vcpu); 6618 put_cpu(); 6619 if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { 6620 err = alloc_apic_access_page(kvm); 6621 if (err) 6622 goto free_vmcs; 6623 } 6624 6625 if (enable_ept && !enable_unrestricted_guest) { 6626 err = init_rmode_identity_map(kvm); 6627 if (err) 6628 goto free_vmcs; 6629 } 6630 6631 if (nested) 6632 nested_vmx_setup_ctls_msrs(&vmx->nested.msrs, 6633 vmx_capability.ept, 6634 kvm_vcpu_apicv_active(&vmx->vcpu)); 6635 else 6636 memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs)); 6637 6638 vmx->nested.posted_intr_nv = -1; 6639 vmx->nested.current_vmptr = -1ull; 6640 6641 vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; 6642 
6643 /* 6644 * Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR 6645 * or POSTED_INTR_WAKEUP_VECTOR. 6646 */ 6647 vmx->pi_desc.nv = POSTED_INTR_VECTOR; 6648 vmx->pi_desc.sn = 1; 6649 6650 vmx->ept_pointer = INVALID_PAGE; 6651 6652 return &vmx->vcpu; 6653 6654 free_vmcs: 6655 free_loaded_vmcs(vmx->loaded_vmcs); 6656 free_msrs: 6657 kfree(vmx->guest_msrs); 6658 free_pml: 6659 vmx_destroy_pml_buffer(vmx); 6660 uninit_vcpu: 6661 kvm_vcpu_uninit(&vmx->vcpu); 6662 free_vcpu: 6663 free_vpid(vmx->vpid); 6664 kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); 6665 free_partial_vcpu: 6666 kmem_cache_free(kvm_vcpu_cache, vmx); 6667 return ERR_PTR(err); 6668 } 6669 6670 #define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" 6671 #define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n" 6672 6673 static int vmx_vm_init(struct kvm *kvm) 6674 { 6675 spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); 6676 6677 if (!ple_gap) 6678 kvm->arch.pause_in_guest = true; 6679 6680 if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) { 6681 switch (l1tf_mitigation) { 6682 case L1TF_MITIGATION_OFF: 6683 case L1TF_MITIGATION_FLUSH_NOWARN: 6684 /* 'I explicitly don't care' is set */ 6685 break; 6686 case L1TF_MITIGATION_FLUSH: 6687 case L1TF_MITIGATION_FLUSH_NOSMT: 6688 case L1TF_MITIGATION_FULL: 6689 /* 6690 * Warn upon starting the first VM in a potentially 6691 * insecure environment. 
6692 */ 6693 if (sched_smt_active()) 6694 pr_warn_once(L1TF_MSG_SMT); 6695 if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER) 6696 pr_warn_once(L1TF_MSG_L1D); 6697 break; 6698 case L1TF_MITIGATION_FULL_FORCE: 6699 /* Flush is enforced */ 6700 break; 6701 } 6702 } 6703 return 0; 6704 } 6705 6706 static void __init vmx_check_processor_compat(void *rtn) 6707 { 6708 struct vmcs_config vmcs_conf; 6709 struct vmx_capability vmx_cap; 6710 6711 *(int *)rtn = 0; 6712 if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) 6713 *(int *)rtn = -EIO; 6714 if (nested) 6715 nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept, 6716 enable_apicv); 6717 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { 6718 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", 6719 smp_processor_id()); 6720 *(int *)rtn = -EIO; 6721 } 6722 } 6723 6724 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 6725 { 6726 u8 cache; 6727 u64 ipat = 0; 6728 6729 /* For VT-d and EPT combination 6730 * 1. MMIO: always map as UC 6731 * 2. EPT with VT-d: 6732 * a. VT-d without snooping control feature: can't guarantee the 6733 * result, try to trust guest. 6734 * b. VT-d with snooping control feature: snooping control feature of 6735 * VT-d engine can guarantee the cache correctness. Just set it 6736 * to WB to keep consistent with host. So the same as item 3. 6737 * 3. 
EPT without VT-d: always map as WB and set IPAT=1 to keep 6738 * consistent with host MTRR 6739 */ 6740 if (is_mmio) { 6741 cache = MTRR_TYPE_UNCACHABLE; 6742 goto exit; 6743 } 6744 6745 if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { 6746 ipat = VMX_EPT_IPAT_BIT; 6747 cache = MTRR_TYPE_WRBACK; 6748 goto exit; 6749 } 6750 6751 if (kvm_read_cr0(vcpu) & X86_CR0_CD) { 6752 ipat = VMX_EPT_IPAT_BIT; 6753 if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) 6754 cache = MTRR_TYPE_WRBACK; 6755 else 6756 cache = MTRR_TYPE_UNCACHABLE; 6757 goto exit; 6758 } 6759 6760 cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); 6761 6762 exit: 6763 return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; 6764 } 6765 6766 static int vmx_get_lpage_level(void) 6767 { 6768 if (enable_ept && !cpu_has_vmx_ept_1g_page()) 6769 return PT_DIRECTORY_LEVEL; 6770 else 6771 /* For shadow and EPT supported 1GB page */ 6772 return PT_PDPE_LEVEL; 6773 } 6774 6775 static void vmcs_set_secondary_exec_control(u32 new_ctl) 6776 { 6777 /* 6778 * These bits in the secondary execution controls field 6779 * are dynamic, the others are mostly based on the hypervisor 6780 * architecture and the guest's CPUID. Do not touch the 6781 * dynamic bits. 6782 */ 6783 u32 mask = 6784 SECONDARY_EXEC_SHADOW_VMCS | 6785 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 6786 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 6787 SECONDARY_EXEC_DESC; 6788 6789 u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6790 6791 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 6792 (new_ctl & ~mask) | (cur_ctl & mask)); 6793 } 6794 6795 /* 6796 * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits 6797 * (indicating "allowed-1") if they are supported in the guest's CPUID. 
6798 */ 6799 static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) 6800 { 6801 struct vcpu_vmx *vmx = to_vmx(vcpu); 6802 struct kvm_cpuid_entry2 *entry; 6803 6804 vmx->nested.msrs.cr0_fixed1 = 0xffffffff; 6805 vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE; 6806 6807 #define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ 6808 if (entry && (entry->_reg & (_cpuid_mask))) \ 6809 vmx->nested.msrs.cr4_fixed1 |= (_cr4_mask); \ 6810 } while (0) 6811 6812 entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); 6813 cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); 6814 cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); 6815 cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); 6816 cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); 6817 cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); 6818 cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); 6819 cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); 6820 cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); 6821 cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); 6822 cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); 6823 cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); 6824 cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); 6825 cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID)); 6826 cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); 6827 6828 entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); 6829 cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); 6830 cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); 6831 cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); 6832 cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); 6833 cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); 6834 6835 #undef cr4_fixed1_update 6836 } 6837 6838 static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) 6839 { 6840 struct vcpu_vmx *vmx = to_vmx(vcpu); 6841 6842 
if (kvm_mpx_supported()) { 6843 bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); 6844 6845 if (mpx_enabled) { 6846 vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; 6847 vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 6848 } else { 6849 vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; 6850 vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; 6851 } 6852 } 6853 } 6854 6855 static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) 6856 { 6857 struct vcpu_vmx *vmx = to_vmx(vcpu); 6858 struct kvm_cpuid_entry2 *best = NULL; 6859 int i; 6860 6861 for (i = 0; i < PT_CPUID_LEAVES; i++) { 6862 best = kvm_find_cpuid_entry(vcpu, 0x14, i); 6863 if (!best) 6864 return; 6865 vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax; 6866 vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx; 6867 vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx; 6868 vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx; 6869 } 6870 6871 /* Get the number of configurable Address Ranges for filtering */ 6872 vmx->pt_desc.addr_range = intel_pt_validate_cap(vmx->pt_desc.caps, 6873 PT_CAP_num_address_ranges); 6874 6875 /* Initialize and clear the no dependency bits */ 6876 vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN | RTIT_CTL_OS | 6877 RTIT_CTL_USR | RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC); 6878 6879 /* 6880 * If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise 6881 * will inject an #GP 6882 */ 6883 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) 6884 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN; 6885 6886 /* 6887 * If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and 6888 * PSBFreq can be set 6889 */ 6890 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc)) 6891 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC | 6892 RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ); 6893 6894 /* 6895 * If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn BranchEn and 6896 * MTCFreq can be set 6897 */ 6898 if 
(intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc)) 6899 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN | 6900 RTIT_CTL_BRANCH_EN | RTIT_CTL_MTC_RANGE); 6901 6902 /* If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set */ 6903 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_ptwrite)) 6904 vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW | 6905 RTIT_CTL_PTW_EN); 6906 6907 /* If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set */ 6908 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_power_event_trace)) 6909 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN; 6910 6911 /* If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set */ 6912 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output)) 6913 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA; 6914 6915 /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */ 6916 if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys)) 6917 vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN; 6918 6919 /* unmask address range configure area */ 6920 for (i = 0; i < vmx->pt_desc.addr_range; i++) 6921 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); 6922 } 6923 6924 static void vmx_cpuid_update(struct kvm_vcpu *vcpu) 6925 { 6926 struct vcpu_vmx *vmx = to_vmx(vcpu); 6927 6928 if (cpu_has_secondary_exec_ctrls()) { 6929 vmx_compute_secondary_exec_control(vmx); 6930 vmcs_set_secondary_exec_control(vmx->secondary_exec_control); 6931 } 6932 6933 if (nested_vmx_allowed(vcpu)) 6934 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 6935 FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 6936 else 6937 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 6938 ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 6939 6940 if (nested_vmx_allowed(vcpu)) { 6941 nested_vmx_cr_fixed1_bits_update(vcpu); 6942 nested_vmx_entry_exit_ctls_update(vcpu); 6943 } 6944 6945 if (boot_cpu_has(X86_FEATURE_INTEL_PT) && 6946 guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) 6947 update_intel_pt_cfg(vcpu); 6948 } 6949 6950 static void 
vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 6951 { 6952 if (func == 1 && nested) 6953 entry->ecx |= bit(X86_FEATURE_VMX); 6954 } 6955 6956 static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) 6957 { 6958 to_vmx(vcpu)->req_immediate_exit = true; 6959 } 6960 6961 static int vmx_check_intercept(struct kvm_vcpu *vcpu, 6962 struct x86_instruction_info *info, 6963 enum x86_intercept_stage stage) 6964 { 6965 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6966 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 6967 6968 /* 6969 * RDPID causes #UD if disabled through secondary execution controls. 6970 * Because it is marked as EmulateOnUD, we need to intercept it here. 6971 */ 6972 if (info->intercept == x86_intercept_rdtscp && 6973 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { 6974 ctxt->exception.vector = UD_VECTOR; 6975 ctxt->exception.error_code_valid = false; 6976 return X86EMUL_PROPAGATE_FAULT; 6977 } 6978 6979 /* TODO: check more intercepts... */ 6980 return X86EMUL_CONTINUE; 6981 } 6982 6983 #ifdef CONFIG_X86_64 6984 /* (a << shift) / divisor, return 1 if overflow otherwise 0 */ 6985 static inline int u64_shl_div_u64(u64 a, unsigned int shift, 6986 u64 divisor, u64 *result) 6987 { 6988 u64 low = a << shift, high = a >> (64 - shift); 6989 6990 /* To avoid the overflow on divq */ 6991 if (high >= divisor) 6992 return 1; 6993 6994 /* Low hold the result, high hold rem which is discarded */ 6995 asm("divq %2\n\t" : "=a" (low), "=d" (high) : 6996 "rm" (divisor), "0" (low), "1" (high)); 6997 *result = low; 6998 6999 return 0; 7000 } 7001 7002 static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) 7003 { 7004 struct vcpu_vmx *vmx; 7005 u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; 7006 7007 if (kvm_mwait_in_guest(vcpu->kvm)) 7008 return -EOPNOTSUPP; 7009 7010 vmx = to_vmx(vcpu); 7011 tscl = rdtsc(); 7012 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); 7013 delta_tsc = max(guest_deadline_tsc, guest_tscl) - 
/*
 * Hook run when @vcpu is scheduled in. Calls shrink_ple_window() for the
 * vCPU unless kvm_pause_in_guest() is true for its VM. @cpu is unused.
 */
static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
	if (!kvm_pause_in_guest(vcpu->kvm))
		shrink_ple_window(vcpu);
}

/*
 * Prepare @slot for dirty logging by calling, in order,
 * kvm_mmu_slot_leaf_clear_dirty() and
 * kvm_mmu_slot_largepage_remove_write_access() on it.
 * NOTE(review): going by the helper names, leaf mappings have their dirty
 * state cleared and large pages lose write access - confirm in the MMU code.
 */
static void vmx_slot_enable_log_dirty(struct kvm *kvm,
				     struct kvm_memory_slot *slot)
{
	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
}

/*
 * Counterpart of vmx_slot_enable_log_dirty(): hands @slot to
 * kvm_mmu_slot_set_dirty() when dirty logging is turned off.
 */
static void vmx_slot_disable_log_dirty(struct kvm *kvm,
				       struct kvm_memory_slot *slot)
{
	kvm_mmu_slot_set_dirty(kvm, slot);
}

/* Flush the PML buffers of @kvm via kvm_flush_pml_buffers(). */
static void vmx_flush_log_dirty(struct kvm *kvm)
{
	kvm_flush_pml_buffers(kvm);
}
/*
 * Thin wrapper: forwards the masked dirty-logging request for @memslot
 * (starting at @offset, selecting pages through @mask) to
 * kvm_mmu_clear_dirty_pt_masked().
 */
static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
					   struct kvm_memory_slot *memslot,
					   gfn_t offset, unsigned long mask)
{
	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}

/*
 * Undo the blocking setup for @vcpu's posted-interrupt descriptor.
 *
 * The descriptor's control word is updated atomically so that the
 * notification destination is the physical ID of vcpu->cpu and the
 * notification vector is POSTED_INTR_VECTOR again. The vCPU is then taken
 * off the blocked list of the CPU recorded in vcpu->pre_pcpu, and pre_pcpu
 * is reset to -1.
 *
 * Both callers in this file (pi_pre_block() and pi_post_block()) invoke
 * this with local interrupts disabled.
 */
static void __pi_post_block(struct kvm_vcpu *vcpu)
{
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
	struct pi_desc old, new;
	unsigned int dest;

	/*
	 * Snapshot the control word, build the new value from the snapshot
	 * and retry until cmpxchg64() sees the descriptor unchanged in
	 * between.
	 */
	do {
		old.control = new.control = pi_desc->control;
		WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
		     "Wakeup handler not enabled while the VCPU is blocked\n");

		dest = cpu_physical_id(vcpu->cpu);

		/* NDST layout differs: full ID in x2APIC mode, bits 15:8 otherwise. */
		if (x2apic_enabled())
			new.ndst = dest;
		else
			new.ndst = (dest << 8) & 0xFF00;

		/* set 'NV' to 'notification vector' */
		new.nv = POSTED_INTR_VECTOR;
	} while (cmpxchg64(&pi_desc->control, old.control,
			   new.control) != old.control);

	/* pre_pcpu == -1 would mean the vCPU was never put on a blocked list. */
	if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
		list_del(&vcpu->blocked_vcpu_list);
		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
		vcpu->pre_pcpu = -1;
	}
}
/*
 * This routine does the following things for a vCPU which is going
 * to be blocked if VT-d PI is enabled.
 * - Store the vCPU on the wakeup list, so when interrupts happen
 *   we can find the right vCPU to wake up.
 * - Change the Posted-interrupt descriptor as below:
 *      'NDST' <-- vcpu->pre_pcpu
 *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
 * - If 'ON' is set during this process, which means at least one
 *   interrupt is posted for this vCPU, we cannot block it; in
 *   this case return 1, otherwise return 0.
 *
 * Returns 0 immediately, without touching the descriptor, when the VM has
 * no assigned device, interrupt posting is not available, or APICv is not
 * active for the vCPU.
 */
static int pi_pre_block(struct kvm_vcpu *vcpu)
{
	unsigned int dest;
	struct pi_desc old, new;
	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);

	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
		!irq_remapping_cap(IRQ_POSTING_CAP)  ||
		!kvm_vcpu_apicv_active(vcpu))
		return 0;

	/* The list and descriptor updates below run with IRQs off. */
	WARN_ON(irqs_disabled());
	local_irq_disable();
	/* A pre_pcpu other than -1 means the vCPU is already on a list. */
	if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
		vcpu->pre_pcpu = vcpu->cpu;
		spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
		list_add_tail(&vcpu->blocked_vcpu_list,
			      &per_cpu(blocked_vcpu_on_cpu,
				       vcpu->pre_pcpu));
		spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
	}

	/* Atomically retarget the descriptor; retry if it changed meanwhile. */
	do {
		old.control = new.control = pi_desc->control;

		WARN((pi_desc->sn == 1),
		     "Warning: SN field of posted-interrupts "
		     "is set before blocking\n");

		/*
		 * Since vCPU can be preempted during this process,
		 * vcpu->cpu could be different with pre_pcpu, we
		 * need to set pre_pcpu as the destination of wakeup
		 * notification event, then we can find the right vCPU
		 * to wakeup in wakeup handler if interrupts happen
		 * when the vCPU is in blocked state.
		 */
		dest = cpu_physical_id(vcpu->pre_pcpu);

		if (x2apic_enabled())
			new.ndst = dest;
		else
			new.ndst = (dest << 8) & 0xFF00;

		/* set 'NV' to 'wakeup vector' */
		new.nv = POSTED_INTR_WAKEUP_VECTOR;
	} while (cmpxchg64(&pi_desc->control, old.control,
			   new.control) != old.control);

	/* We should not block the vCPU if an interrupt is posted for it.  */
	if (pi_test_on(pi_desc) == 1)
		__pi_post_block(vcpu);

	local_irq_enable();
	/* __pi_post_block() resets pre_pcpu to -1, so this is 1 iff we backed out. */
	return (vcpu->pre_pcpu == -1);
}
srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); 7276 if (guest_irq >= irq_rt->nr_rt_entries || 7277 hlist_empty(&irq_rt->map[guest_irq])) { 7278 pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", 7279 guest_irq, irq_rt->nr_rt_entries); 7280 goto out; 7281 } 7282 7283 hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { 7284 if (e->type != KVM_IRQ_ROUTING_MSI) 7285 continue; 7286 /* 7287 * VT-d PI cannot support posting multicast/broadcast 7288 * interrupts to a vCPU, we still use interrupt remapping 7289 * for these kind of interrupts. 7290 * 7291 * For lowest-priority interrupts, we only support 7292 * those with single CPU as the destination, e.g. user 7293 * configures the interrupts via /proc/irq or uses 7294 * irqbalance to make the interrupts single-CPU. 7295 * 7296 * We will support full lowest-priority interrupt later. 7297 */ 7298 7299 kvm_set_msi_irq(kvm, e, &irq); 7300 if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { 7301 /* 7302 * Make sure the IRTE is in remapped mode if 7303 * we don't handle it in posted mode. 
7304 */ 7305 ret = irq_set_vcpu_affinity(host_irq, NULL); 7306 if (ret < 0) { 7307 printk(KERN_INFO 7308 "failed to back to remapped mode, irq: %u\n", 7309 host_irq); 7310 goto out; 7311 } 7312 7313 continue; 7314 } 7315 7316 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); 7317 vcpu_info.vector = irq.vector; 7318 7319 trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, 7320 vcpu_info.vector, vcpu_info.pi_desc_addr, set); 7321 7322 if (set) 7323 ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); 7324 else 7325 ret = irq_set_vcpu_affinity(host_irq, NULL); 7326 7327 if (ret < 0) { 7328 printk(KERN_INFO "%s: failed to update PI IRTE\n", 7329 __func__); 7330 goto out; 7331 } 7332 } 7333 7334 ret = 0; 7335 out: 7336 srcu_read_unlock(&kvm->irq_srcu, idx); 7337 return ret; 7338 } 7339 7340 static void vmx_setup_mce(struct kvm_vcpu *vcpu) 7341 { 7342 if (vcpu->arch.mcg_cap & MCG_LMCE_P) 7343 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= 7344 FEATURE_CONTROL_LMCE; 7345 else 7346 to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= 7347 ~FEATURE_CONTROL_LMCE; 7348 } 7349 7350 static int vmx_smi_allowed(struct kvm_vcpu *vcpu) 7351 { 7352 /* we need a nested vmexit to enter SMM, postpone if run is pending */ 7353 if (to_vmx(vcpu)->nested.nested_run_pending) 7354 return 0; 7355 return 1; 7356 } 7357 7358 static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) 7359 { 7360 struct vcpu_vmx *vmx = to_vmx(vcpu); 7361 7362 vmx->nested.smm.guest_mode = is_guest_mode(vcpu); 7363 if (vmx->nested.smm.guest_mode) 7364 nested_vmx_vmexit(vcpu, -1, 0, 0); 7365 7366 vmx->nested.smm.vmxon = vmx->nested.vmxon; 7367 vmx->nested.vmxon = false; 7368 vmx_clear_hlt(vcpu); 7369 return 0; 7370 } 7371 7372 static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) 7373 { 7374 struct vcpu_vmx *vmx = to_vmx(vcpu); 7375 int ret; 7376 7377 if (vmx->nested.smm.vmxon) { 7378 vmx->nested.vmxon = true; 7379 vmx->nested.smm.vmxon = false; 7380 } 7381 7382 if 
(vmx->nested.smm.guest_mode) { 7383 vcpu->arch.hflags &= ~HF_SMM_MASK; 7384 ret = nested_vmx_enter_non_root_mode(vcpu, false); 7385 vcpu->arch.hflags |= HF_SMM_MASK; 7386 if (ret) 7387 return ret; 7388 7389 vmx->nested.smm.guest_mode = false; 7390 } 7391 return 0; 7392 } 7393 7394 static int enable_smi_window(struct kvm_vcpu *vcpu) 7395 { 7396 return 0; 7397 } 7398 7399 static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) 7400 { 7401 return 0; 7402 } 7403 7404 static __init int hardware_setup(void) 7405 { 7406 unsigned long host_bndcfgs; 7407 int r, i; 7408 7409 rdmsrl_safe(MSR_EFER, &host_efer); 7410 7411 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) 7412 kvm_define_shared_msr(i, vmx_msr_index[i]); 7413 7414 if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0) 7415 return -EIO; 7416 7417 if (boot_cpu_has(X86_FEATURE_NX)) 7418 kvm_enable_efer_bits(EFER_NX); 7419 7420 if (boot_cpu_has(X86_FEATURE_MPX)) { 7421 rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); 7422 WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); 7423 } 7424 7425 if (boot_cpu_has(X86_FEATURE_XSAVES)) 7426 rdmsrl(MSR_IA32_XSS, host_xss); 7427 7428 if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || 7429 !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) 7430 enable_vpid = 0; 7431 7432 if (!cpu_has_vmx_ept() || 7433 !cpu_has_vmx_ept_4levels() || 7434 !cpu_has_vmx_ept_mt_wb() || 7435 !cpu_has_vmx_invept_global()) 7436 enable_ept = 0; 7437 7438 if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) 7439 enable_ept_ad_bits = 0; 7440 7441 if (!cpu_has_vmx_unrestricted_guest() || !enable_ept) 7442 enable_unrestricted_guest = 0; 7443 7444 if (!cpu_has_vmx_flexpriority()) 7445 flexpriority_enabled = 0; 7446 7447 if (!cpu_has_virtual_nmis()) 7448 enable_vnmi = 0; 7449 7450 /* 7451 * set_apic_access_page_addr() is used to reload apic access 7452 * page upon invalidation. No need to do anything if not 7453 * using the APIC_ACCESS_ADDR VMCS field. 
7454 */ 7455 if (!flexpriority_enabled) 7456 kvm_x86_ops->set_apic_access_page_addr = NULL; 7457 7458 if (!cpu_has_vmx_tpr_shadow()) 7459 kvm_x86_ops->update_cr8_intercept = NULL; 7460 7461 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 7462 kvm_disable_largepages(); 7463 7464 #if IS_ENABLED(CONFIG_HYPERV) 7465 if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH 7466 && enable_ept) { 7467 kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb; 7468 kvm_x86_ops->tlb_remote_flush_with_range = 7469 hv_remote_flush_tlb_with_range; 7470 } 7471 #endif 7472 7473 if (!cpu_has_vmx_ple()) { 7474 ple_gap = 0; 7475 ple_window = 0; 7476 ple_window_grow = 0; 7477 ple_window_max = 0; 7478 ple_window_shrink = 0; 7479 } 7480 7481 if (!cpu_has_vmx_apicv()) { 7482 enable_apicv = 0; 7483 kvm_x86_ops->sync_pir_to_irr = NULL; 7484 } 7485 7486 if (cpu_has_vmx_tsc_scaling()) { 7487 kvm_has_tsc_control = true; 7488 kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; 7489 kvm_tsc_scaling_ratio_frac_bits = 48; 7490 } 7491 7492 set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ 7493 7494 if (enable_ept) 7495 vmx_enable_tdp(); 7496 else 7497 kvm_disable_tdp(); 7498 7499 /* 7500 * Only enable PML when hardware supports PML feature, and both EPT 7501 * and EPT A/D bit features are enabled -- PML depends on them to work. 
7502 */ 7503 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 7504 enable_pml = 0; 7505 7506 if (!enable_pml) { 7507 kvm_x86_ops->slot_enable_log_dirty = NULL; 7508 kvm_x86_ops->slot_disable_log_dirty = NULL; 7509 kvm_x86_ops->flush_log_dirty = NULL; 7510 kvm_x86_ops->enable_log_dirty_pt_masked = NULL; 7511 } 7512 7513 if (!cpu_has_vmx_preemption_timer()) 7514 kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; 7515 7516 if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { 7517 u64 vmx_msr; 7518 7519 rdmsrl(MSR_IA32_VMX_MISC, vmx_msr); 7520 cpu_preemption_timer_multi = 7521 vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; 7522 } else { 7523 kvm_x86_ops->set_hv_timer = NULL; 7524 kvm_x86_ops->cancel_hv_timer = NULL; 7525 } 7526 7527 kvm_set_posted_intr_wakeup_handler(wakeup_handler); 7528 7529 kvm_mce_cap_supported |= MCG_LMCE_P; 7530 7531 if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST) 7532 return -EINVAL; 7533 if (!enable_ept || !cpu_has_vmx_intel_pt()) 7534 pt_mode = PT_MODE_SYSTEM; 7535 7536 if (nested) { 7537 nested_vmx_setup_ctls_msrs(&vmcs_config.nested, 7538 vmx_capability.ept, enable_apicv); 7539 7540 r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); 7541 if (r) 7542 return r; 7543 } 7544 7545 r = alloc_kvm_area(); 7546 if (r) 7547 nested_vmx_hardware_unsetup(); 7548 return r; 7549 } 7550 7551 static __exit void hardware_unsetup(void) 7552 { 7553 if (nested) 7554 nested_vmx_hardware_unsetup(); 7555 7556 free_kvm_area(); 7557 } 7558 7559 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { 7560 .cpu_has_kvm_support = cpu_has_kvm_support, 7561 .disabled_by_bios = vmx_disabled_by_bios, 7562 .hardware_setup = hardware_setup, 7563 .hardware_unsetup = hardware_unsetup, 7564 .check_processor_compatibility = vmx_check_processor_compat, 7565 .hardware_enable = hardware_enable, 7566 .hardware_disable = hardware_disable, 7567 .cpu_has_accelerated_tpr = report_flexpriority, 7568 .has_emulated_msr = 
vmx_has_emulated_msr, 7569 7570 .vm_init = vmx_vm_init, 7571 .vm_alloc = vmx_vm_alloc, 7572 .vm_free = vmx_vm_free, 7573 7574 .vcpu_create = vmx_create_vcpu, 7575 .vcpu_free = vmx_free_vcpu, 7576 .vcpu_reset = vmx_vcpu_reset, 7577 7578 .prepare_guest_switch = vmx_prepare_switch_to_guest, 7579 .vcpu_load = vmx_vcpu_load, 7580 .vcpu_put = vmx_vcpu_put, 7581 7582 .update_bp_intercept = update_exception_bitmap, 7583 .get_msr_feature = vmx_get_msr_feature, 7584 .get_msr = vmx_get_msr, 7585 .set_msr = vmx_set_msr, 7586 .get_segment_base = vmx_get_segment_base, 7587 .get_segment = vmx_get_segment, 7588 .set_segment = vmx_set_segment, 7589 .get_cpl = vmx_get_cpl, 7590 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 7591 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, 7592 .decache_cr3 = vmx_decache_cr3, 7593 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 7594 .set_cr0 = vmx_set_cr0, 7595 .set_cr3 = vmx_set_cr3, 7596 .set_cr4 = vmx_set_cr4, 7597 .set_efer = vmx_set_efer, 7598 .get_idt = vmx_get_idt, 7599 .set_idt = vmx_set_idt, 7600 .get_gdt = vmx_get_gdt, 7601 .set_gdt = vmx_set_gdt, 7602 .get_dr6 = vmx_get_dr6, 7603 .set_dr6 = vmx_set_dr6, 7604 .set_dr7 = vmx_set_dr7, 7605 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, 7606 .cache_reg = vmx_cache_reg, 7607 .get_rflags = vmx_get_rflags, 7608 .set_rflags = vmx_set_rflags, 7609 7610 .tlb_flush = vmx_flush_tlb, 7611 .tlb_flush_gva = vmx_flush_tlb_gva, 7612 7613 .run = vmx_vcpu_run, 7614 .handle_exit = vmx_handle_exit, 7615 .skip_emulated_instruction = skip_emulated_instruction, 7616 .set_interrupt_shadow = vmx_set_interrupt_shadow, 7617 .get_interrupt_shadow = vmx_get_interrupt_shadow, 7618 .patch_hypercall = vmx_patch_hypercall, 7619 .set_irq = vmx_inject_irq, 7620 .set_nmi = vmx_inject_nmi, 7621 .queue_exception = vmx_queue_exception, 7622 .cancel_injection = vmx_cancel_injection, 7623 .interrupt_allowed = vmx_interrupt_allowed, 7624 .nmi_allowed = vmx_nmi_allowed, 7625 .get_nmi_mask = vmx_get_nmi_mask, 7626 
.set_nmi_mask = vmx_set_nmi_mask, 7627 .enable_nmi_window = enable_nmi_window, 7628 .enable_irq_window = enable_irq_window, 7629 .update_cr8_intercept = update_cr8_intercept, 7630 .set_virtual_apic_mode = vmx_set_virtual_apic_mode, 7631 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 7632 .get_enable_apicv = vmx_get_enable_apicv, 7633 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, 7634 .load_eoi_exitmap = vmx_load_eoi_exitmap, 7635 .apicv_post_state_restore = vmx_apicv_post_state_restore, 7636 .hwapic_irr_update = vmx_hwapic_irr_update, 7637 .hwapic_isr_update = vmx_hwapic_isr_update, 7638 .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, 7639 .sync_pir_to_irr = vmx_sync_pir_to_irr, 7640 .deliver_posted_interrupt = vmx_deliver_posted_interrupt, 7641 7642 .set_tss_addr = vmx_set_tss_addr, 7643 .set_identity_map_addr = vmx_set_identity_map_addr, 7644 .get_tdp_level = get_ept_level, 7645 .get_mt_mask = vmx_get_mt_mask, 7646 7647 .get_exit_info = vmx_get_exit_info, 7648 7649 .get_lpage_level = vmx_get_lpage_level, 7650 7651 .cpuid_update = vmx_cpuid_update, 7652 7653 .rdtscp_supported = vmx_rdtscp_supported, 7654 .invpcid_supported = vmx_invpcid_supported, 7655 7656 .set_supported_cpuid = vmx_set_supported_cpuid, 7657 7658 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 7659 7660 .read_l1_tsc_offset = vmx_read_l1_tsc_offset, 7661 .write_l1_tsc_offset = vmx_write_l1_tsc_offset, 7662 7663 .set_tdp_cr3 = vmx_set_cr3, 7664 7665 .check_intercept = vmx_check_intercept, 7666 .handle_external_intr = vmx_handle_external_intr, 7667 .mpx_supported = vmx_mpx_supported, 7668 .xsaves_supported = vmx_xsaves_supported, 7669 .umip_emulated = vmx_umip_emulated, 7670 .pt_supported = vmx_pt_supported, 7671 7672 .request_immediate_exit = vmx_request_immediate_exit, 7673 7674 .sched_in = vmx_sched_in, 7675 7676 .slot_enable_log_dirty = vmx_slot_enable_log_dirty, 7677 .slot_disable_log_dirty = vmx_slot_disable_log_dirty, 7678 .flush_log_dirty = vmx_flush_log_dirty, 
7679 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, 7680 .write_log_dirty = vmx_write_pml_buffer, 7681 7682 .pre_block = vmx_pre_block, 7683 .post_block = vmx_post_block, 7684 7685 .pmu_ops = &intel_pmu_ops, 7686 7687 .update_pi_irte = vmx_update_pi_irte, 7688 7689 #ifdef CONFIG_X86_64 7690 .set_hv_timer = vmx_set_hv_timer, 7691 .cancel_hv_timer = vmx_cancel_hv_timer, 7692 #endif 7693 7694 .setup_mce = vmx_setup_mce, 7695 7696 .smi_allowed = vmx_smi_allowed, 7697 .pre_enter_smm = vmx_pre_enter_smm, 7698 .pre_leave_smm = vmx_pre_leave_smm, 7699 .enable_smi_window = enable_smi_window, 7700 7701 .check_nested_events = NULL, 7702 .get_nested_state = NULL, 7703 .set_nested_state = NULL, 7704 .get_vmcs12_pages = NULL, 7705 .nested_enable_evmcs = NULL, 7706 .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, 7707 }; 7708 7709 static void vmx_cleanup_l1d_flush(void) 7710 { 7711 if (vmx_l1d_flush_pages) { 7712 free_pages((unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER); 7713 vmx_l1d_flush_pages = NULL; 7714 } 7715 /* Restore state so sysfs ignores VMX */ 7716 l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; 7717 } 7718 7719 static void vmx_exit(void) 7720 { 7721 #ifdef CONFIG_KEXEC_CORE 7722 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); 7723 synchronize_rcu(); 7724 #endif 7725 7726 kvm_exit(); 7727 7728 #if IS_ENABLED(CONFIG_HYPERV) 7729 if (static_branch_unlikely(&enable_evmcs)) { 7730 int cpu; 7731 struct hv_vp_assist_page *vp_ap; 7732 /* 7733 * Reset everything to support using non-enlightened VMCS 7734 * access later (e.g. 
when we reload the module with 7735 * enlightened_vmcs=0) 7736 */ 7737 for_each_online_cpu(cpu) { 7738 vp_ap = hv_get_vp_assist_page(cpu); 7739 7740 if (!vp_ap) 7741 continue; 7742 7743 vp_ap->current_nested_vmcs = 0; 7744 vp_ap->enlighten_vmentry = 0; 7745 } 7746 7747 static_branch_disable(&enable_evmcs); 7748 } 7749 #endif 7750 vmx_cleanup_l1d_flush(); 7751 } 7752 module_exit(vmx_exit); 7753 7754 static int __init vmx_init(void) 7755 { 7756 int r; 7757 7758 #if IS_ENABLED(CONFIG_HYPERV) 7759 /* 7760 * Enlightened VMCS usage should be recommended and the host needs 7761 * to support eVMCS v1 or above. We can also disable eVMCS support 7762 * with module parameter. 7763 */ 7764 if (enlightened_vmcs && 7765 ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED && 7766 (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >= 7767 KVM_EVMCS_VERSION) { 7768 int cpu; 7769 7770 /* Check that we have assist pages on all online CPUs */ 7771 for_each_online_cpu(cpu) { 7772 if (!hv_get_vp_assist_page(cpu)) { 7773 enlightened_vmcs = false; 7774 break; 7775 } 7776 } 7777 7778 if (enlightened_vmcs) { 7779 pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n"); 7780 static_branch_enable(&enable_evmcs); 7781 } 7782 } else { 7783 enlightened_vmcs = false; 7784 } 7785 #endif 7786 7787 r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), 7788 __alignof__(struct vcpu_vmx), THIS_MODULE); 7789 if (r) 7790 return r; 7791 7792 /* 7793 * Must be called after kvm_init() so enable_ept is properly set 7794 * up. Hand the parameter mitigation value in which was stored in 7795 * the pre module init parser. If no parameter was given, it will 7796 * contain 'auto' which will be turned into the default 'cond' 7797 * mitigation mode. 
7798 */ 7799 if (boot_cpu_has(X86_BUG_L1TF)) { 7800 r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); 7801 if (r) { 7802 vmx_exit(); 7803 return r; 7804 } 7805 } 7806 7807 #ifdef CONFIG_KEXEC_CORE 7808 rcu_assign_pointer(crash_vmclear_loaded_vmcss, 7809 crash_vmclear_local_loaded_vmcss); 7810 #endif 7811 vmx_check_vmcs12_offsets(); 7812 7813 return 0; 7814 } 7815 module_init(vmx_init); 7816