#define pr_fmt(fmt) "SVM: " fmt

#include <linux/kvm_host.h>

#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"

#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/amd-iommu.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
#include <linux/objtool.h>
#include <linux/psp-sev.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/rwsem.h>
#include <linux/cc_platform.h>

#include <asm/apic.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/fpu/api.h>

#include <asm/virtext.h>
#include "trace.h"

#include "svm.h"
#include "svm_ops.h"

#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id svm_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif

#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

static bool erratum_383_found __read_mostly;

u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;

/*
 * Set osvw_len to higher value when updated Revision Guides
 * are published and we know what the new status bits are
 */
static uint64_t osvw_len = 4, osvw_status;

static DEFINE_PER_CPU(u64, current_tsc_ratio);
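
/*
 * Candidates for direct (non-intercepted) guest access.  Entries with
 * .always set get their read/write intercepts cleared for every vCPU in
 * svm_vcpu_init_msrpm(); the remaining entries stay intercepted until the
 * feature that needs them is enabled, e.g. the LBR MSRs in svm_enable_lbrv().
 */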

static const struct svm_direct_access_msrs {
	u32 index;   /* Index of the MSR */
	bool always; /* True if intercept is initially cleared */
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
	{ .index = MSR_STAR, .always = true },
	{ .index = MSR_IA32_SYSENTER_CS, .always = true },
	{ .index = MSR_IA32_SYSENTER_EIP, .always = false },
	{ .index = MSR_IA32_SYSENTER_ESP, .always = false },
#ifdef CONFIG_X86_64
	{ .index = MSR_GS_BASE, .always = true },
	{ .index = MSR_FS_BASE, .always = true },
	{ .index = MSR_KERNEL_GS_BASE, .always = true },
	{ .index = MSR_LSTAR, .always = true },
	{ .index = MSR_CSTAR, .always = true },
	{ .index = MSR_SYSCALL_MASK, .always = true },
#endif
	{ .index = MSR_IA32_SPEC_CTRL, .always = false },
	{ .index = MSR_IA32_PRED_CMD, .always = false },
	{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
	{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
	{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
	{ .index = MSR_IA32_LASTINTTOIP, .always = false },
	{ .index = MSR_EFER, .always = false },
	{ .index = MSR_IA32_CR_PAT, .always = false },
	{ .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
	{ .index = MSR_TSC_AUX, .always = false },
	{ .index = MSR_INVALID, .always = false },
};

/*
 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
 * pause_filter_count: On processors that support Pause filtering (indicated
 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 *	count value. On VMRUN this value is loaded into an internal counter.
 *	Each time a pause instruction is executed, this counter is decremented
 *	until it reaches zero at which time a #VMEXIT is generated if pause
 *	intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
 *	Intercept Filtering for more details.
 *	This also indicates whether PLE logic is enabled.
 *
 * pause_filter_thresh: In addition, some processor families support advanced
 *	pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an
 *	upper bound on the amount of time a guest is allowed to execute in a
 *	pause loop.
 *	In this mode, a 16-bit pause filter threshold field is added in the
 *	VMCB. The threshold value is a cycle count that is used to reset the
 *	pause counter. As with simple pause filtering, VMRUN loads the pause
 *	count value from VMCB into an internal counter. Then, on each pause
 *	instruction the hardware checks the elapsed number of cycles since
 *	the most recent pause instruction against the pause filter threshold.
 *	If the elapsed cycle count is greater than the pause filter threshold,
 *	then the internal pause count is reloaded from the VMCB and execution
 *	continues. If the elapsed cycle count is less than the pause filter
 *	threshold, then the internal pause count is decremented. If the count
 *	value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 *	triggered. If advanced pause filtering is supported and pause filter
 *	threshold field is set to zero, the filter will operate in the simpler,
 *	count only mode.
 */

static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);
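
/*
 * Note: grow_ple_window() and shrink_ple_window() below feed these knobs to
 * the common __grow_ple_window()/__shrink_ple_window() helpers, so with the
 * defaults the per-vCPU filter count doubles on every PAUSE exit (capped at
 * pause_filter_count_max) and shrinking resets it back to pause_filter_count.
 */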

/*
 * Use nested page tables by default. Note, NPT may get forced off by
 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 */
bool npt_enabled = true;
module_param_named(npt, npt_enabled, bool, 0444);

/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, S_IRUGO);

/* enable/disable Next RIP Save */
static int nrips = true;
module_param(nrips, int, 0444);

/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);

/* enable/disable Virtual GIF */
int vgif = true;
module_param(vgif, int, 0444);

/* enable/disable LBR virtualization */
static int lbrv = true;
module_param(lbrv, int, 0444);

static int tsc_scaling = true;
module_param(tsc_scaling, int, 0444);

/*
 * enable / disable AVIC.  Because the defaults differ for APICv
 * support between VMX and SVM we cannot use module_param_named.
 */
static bool avic;
module_param(avic, bool, 0444);

static bool force_avic;
module_param_unsafe(force_avic, bool, 0444);

bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);


bool intercept_smi = true;
module_param(intercept_smi, bool, 0444);


static bool svm_gp_erratum_intercept = true;

static u8 rsm_ins_bytes[] = "\x0f\xaa";

static unsigned long iopm_base;

struct kvm_ldttss_desc {
	u16 limit0;
	u16 base0;
	unsigned base1:8, type:5, dpl:2, p:1;
	unsigned limit1:4, zero0:3, g:1, base2:8;
	u32 base3;
	u32 zero1;
} __attribute__((packed));

DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);

/*
 * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 *
 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 * defer the restoration of TSC_AUX until the CPU returns to userspace.
 */
static int tsc_aux_uret_slot __read_mostly = -1;

static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};

#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
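
/*
 * Each MSR occupies two bits in the permission map (read intercept in the
 * even bit, write intercept in the odd bit), so one 2048-byte range covers
 * 8192 MSRs.  Worked example: MSR_LSTAR (0xc0000082) lands in the second
 * range at byte offset 2048 + 0x82 / 4 = 2080, i.e. u32 offset 520, with
 * bits 4 (read) and 5 (write) inside that u32.
 */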

u32 svm_msrpm_offset(u32 msr)
{
	u32 offset;
	int i;

	for (i = 0; i < NUM_MSR_MAPS; i++) {
		if (msr < msrpm_ranges[i] ||
		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
			continue;

		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8   */
		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */

		/* Now we have the u8 offset - but need the u32 offset */
		return offset / 4;
	}

	/* MSR not in any range */
	return MSR_INVALID;
}

static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);

static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
	return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
#else
	return PT32E_ROOT_LEVEL;
#endif
}

int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 old_efer = vcpu->arch.efer;
	vcpu->arch.efer = efer;

	if (!npt_enabled) {
		/* Shadow paging assumes NX to be available.  */
		efer |= EFER_NX;

		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}

	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
		if (!(efer & EFER_SVME)) {
			svm_leave_nested(vcpu);
			svm_set_gif(svm, true);
			/* #GP intercept is still needed for vmware backdoor */
			if (!enable_vmware_backdoor)
				clr_exception_intercept(svm, GP_VECTOR);

			/*
			 * Free the nested guest state, unless we are in SMM.
			 * In this case we will return to the nested guest
			 * as soon as we leave SMM.
			 */
			if (!is_smm(vcpu))
				svm_free_nested(svm);

		} else {
			int ret = svm_allocate_nested(svm);

			if (ret) {
				vcpu->arch.efer = old_efer;
				return ret;
			}

			/*
			 * Never intercept #GP for SEV guests, KVM can't
			 * decrypt guest memory to workaround the erratum.
			 */
			if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
				set_exception_intercept(svm, GP_VECTOR);
		}
	}

	svm->vmcb->save.efer = efer | EFER_SVME;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
	return 0;
}

static int is_external_interrupt(u32 info)
{
	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
}

static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ret = 0;

	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
	return ret;
}

static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (mask == 0)
		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
	else
		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;

}

static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * SEV-ES does not expose the next RIP. The RIP update is controlled by
	 * the type of exit and the #VC handler in the guest.
	 */
	if (sev_es_guest(vcpu->kvm))
		goto done;

	if (nrips && svm->vmcb->control.next_rip != 0) {
		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
		svm->next_rip = svm->vmcb->control.next_rip;
	}

	if (!svm->next_rip) {
		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;
	} else {
		kvm_rip_write(vcpu, svm->next_rip);
	}

done:
	svm_set_interrupt_shadow(vcpu, 0);

	return 1;
}
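
/*
 * Exceptions are injected through the VMCB EVENTINJ field: the vector in the
 * low bits, ORed with the event type and the VALID/VALID_ERR flags, with the
 * error code (if any) going into event_inj_err.  E.g. injecting #GP(0) ends
 * up as event_inj = 13 | SVM_EVTINJ_TYPE_EXEPT | SVM_EVTINJ_VALID |
 * SVM_EVTINJ_VALID_ERR and event_inj_err = 0.
 */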

static void svm_queue_exception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned nr = vcpu->arch.exception.nr;
	bool has_error_code = vcpu->arch.exception.has_error_code;
	u32 error_code = vcpu->arch.exception.error_code;

	kvm_deliver_exception_payload(vcpu);

	if (nr == BP_VECTOR && !nrips) {
		unsigned long rip, old_rip = kvm_rip_read(vcpu);

		/*
		 * For guest debugging where we have to reinject #BP if some
		 * INT3 is guest-owned:
		 * Emulate nRIP by moving RIP forward. Will fail if injection
		 * raises a fault that is not intercepted. Still better than
		 * failing in all cases.
		 */
		(void)svm_skip_emulated_instruction(vcpu);
		rip = kvm_rip_read(vcpu);
		svm->int3_rip = rip + svm->vmcb->save.cs.base;
		svm->int3_injected = rip - old_rip;
	}

	svm->vmcb->control.event_inj = nr
		| SVM_EVTINJ_VALID
		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
		| SVM_EVTINJ_TYPE_EXEPT;
	svm->vmcb->control.event_inj_err = error_code;
}

static void svm_init_erratum_383(void)
{
	u32 low, high;
	int err;
	u64 val;

	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
		return;

	/* Use _safe variants to not break nested virtualization */
	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
	if (err)
		return;

	val |= (1ULL << 47);

	low  = lower_32_bits(val);
	high = upper_32_bits(val);

	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);

	erratum_383_found = true;
}

static void svm_init_osvw(struct kvm_vcpu *vcpu)
{
	/*
	 * Guests should see errata 400 and 415 as fixed (assuming that
	 * HLT and IO instructions are intercepted).
	 */
	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
	vcpu->arch.osvw.status = osvw_status & ~(6ULL);

	/*
	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
	 * all osvw.status bits inside that length, including bit 0 (which is
	 * reserved for erratum 298), are valid. However, if host processor's
	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
	 * be conservative here and therefore we tell the guest that erratum 298
	 * is present (because we really don't know).
	 */
	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
		vcpu->arch.osvw.status |= 1;
}

static int has_svm(void)
{
	const char *msg;

	if (!cpu_has_svm(&msg)) {
		printk(KERN_INFO "has_svm: %s\n", msg);
		return 0;
	}

	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
		pr_info("KVM is unsupported when running as an SEV guest\n");
		return 0;
	}

	return 1;
}
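
/*
 * Note: per the APM, MSR_AMD64_TSC_RATIO is a fixed-point multiplier with
 * the fractional part in the low 32 bits, so SVM_TSC_RATIO_DEFAULT is a 1:1
 * guest/host ratio.  The per-CPU cache below only exists to skip redundant
 * WRMSRs when the ratio hasn't changed.
 */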

void __svm_write_tsc_multiplier(u64 multiplier)
{
	preempt_disable();

	if (multiplier == __this_cpu_read(current_tsc_ratio))
		goto out;

	wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
	__this_cpu_write(current_tsc_ratio, multiplier);
out:
	preempt_enable();
}

static void svm_hardware_disable(void)
{
	/* Make sure we clean up behind us */
	if (tsc_scaling)
		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);

	cpu_svm_disable();

	amd_pmu_disable_virt();
}

static int svm_hardware_enable(void)
{

	struct svm_cpu_data *sd;
	uint64_t efer;
	struct desc_struct *gdt;
	int me = raw_smp_processor_id();

	rdmsrl(MSR_EFER, efer);
	if (efer & EFER_SVME)
		return -EBUSY;

	if (!has_svm()) {
		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
		return -EINVAL;
	}
	sd = per_cpu(svm_data, me);
	if (!sd) {
		pr_err("%s: svm_data is NULL on %d\n", __func__, me);
		return -EINVAL;
	}

	sd->asid_generation = 1;
	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
	sd->next_asid = sd->max_asid + 1;
	sd->min_asid = max_sev_asid + 1;

	gdt = get_current_gdt_rw();
	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);

	wrmsrl(MSR_EFER, efer | EFER_SVME);

	wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));

	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		/*
		 * Set the default value, even if we don't use TSC scaling
		 * to avoid having stale value in the msr
		 */
		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
	}

	/*
	 * Get OSVW bits.
	 *
	 * Note that it is possible to have a system with mixed processor
	 * revisions and therefore different OSVW bits. If bits are not the same
	 * on different processors then choose the worst case (i.e. if erratum
	 * is present on one processor and not on another then assume that the
	 * erratum is present everywhere).
	 */
	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
		uint64_t len, status = 0;
		int err;

		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
		if (!err)
			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
						      &err);

		if (err)
			osvw_status = osvw_len = 0;
		else {
			if (len < osvw_len)
				osvw_len = len;
			osvw_status |= status;
			osvw_status &= (1ULL << osvw_len) - 1;
		}
	} else
		osvw_status = osvw_len = 0;

	svm_init_erratum_383();

	amd_pmu_enable_virt();

	return 0;
}

static void svm_cpu_uninit(int cpu)
{
	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);

	if (!sd)
		return;

	per_cpu(svm_data, cpu) = NULL;
	kfree(sd->sev_vmcbs);
	__free_page(sd->save_area);
	kfree(sd);
}

static int svm_cpu_init(int cpu)
{
	struct svm_cpu_data *sd;
	int ret = -ENOMEM;

	sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
	if (!sd)
		return ret;
	sd->cpu = cpu;
	sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!sd->save_area)
		goto free_cpu_data;

	ret = sev_cpu_init(sd);
	if (ret)
		goto free_save_area;

	per_cpu(svm_data, cpu) = sd;

	return 0;

free_save_area:
	__free_page(sd->save_area);
free_cpu_data:
	kfree(sd);
	return ret;

}

static int direct_access_msr_slot(u32 msr)
{
	u32 i;

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
		if (direct_access_msrs[i].index == msr)
			return i;

	return -ENOENT;
}

static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
				     int write)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int slot = direct_access_msr_slot(msr);

	if (slot == -ENOENT)
		return;

	/* Set the shadow bitmaps to the desired intercept states */
	if (read)
		set_bit(slot, svm->shadow_msr_intercept.read);
	else
		clear_bit(slot, svm->shadow_msr_intercept.read);

	if (write)
		set_bit(slot, svm->shadow_msr_intercept.write);
	else
		clear_bit(slot, svm->shadow_msr_intercept.write);
}

static bool valid_msr_intercept(u32 index)
{
	return direct_access_msr_slot(index) != -ENOENT;
}
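
/*
 * In the MSR permission bitmap a *set* bit means "intercept": each MSR owns
 * two consecutive bits, read access in the even bit and write access in the
 * odd bit, hence the 2 * (msr & 0x0f) arithmetic below.
 */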

static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
	u8 bit_write;
	unsigned long tmp;
	u32 offset;
	u32 *msrpm;

	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
				      to_svm(vcpu)->msrpm;

	offset    = svm_msrpm_offset(msr);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	return !!test_bit(bit_write, &tmp);
}

static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
					u32 msr, int read, int write)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u8 bit_read, bit_write;
	unsigned long tmp;
	u32 offset;

	/*
	 * If this warning triggers extend the direct_access_msrs list at the
	 * beginning of the file
	 */
	WARN_ON(!valid_msr_intercept(msr));

	/* Force MSRs disallowed by the MSR filter to be intercepted */
	if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
		read = 0;

	if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
		write = 0;

	offset    = svm_msrpm_offset(msr);
	bit_read  = 2 * (msr & 0x0f);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);

	msrpm[offset] = tmp;

	svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
	svm->nested.force_msr_bitmap_recalc = true;
}

void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
			  int read, int write)
{
	set_shadow_msr_intercept(vcpu, msr, read, write);
	set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
}

u32 *svm_vcpu_alloc_msrpm(void)
{
	unsigned int order = get_order(MSRPM_SIZE);
	struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
	u32 *msrpm;

	if (!pages)
		return NULL;

	msrpm = page_address(pages);
	memset(msrpm, 0xff, PAGE_SIZE * (1 << order));

	return msrpm;
}
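
/*
 * A freshly allocated bitmap has every bit set, i.e. all MSR accesses are
 * intercepted by default; svm_vcpu_init_msrpm() below then opens up direct
 * access for the .always entries of direct_access_msrs.
 */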

void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
{
	int i;

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		if (!direct_access_msrs[i].always)
			continue;
		set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
	}
}


void svm_vcpu_free_msrpm(u32 *msrpm)
{
	__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}

static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 i;

	/*
	 * Set intercept permissions for all direct access MSRs again. They
	 * will automatically get filtered through the MSR filter, so we are
	 * back in sync after this.
	 */
	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		u32 msr = direct_access_msrs[i].index;
		u32 read = test_bit(i, svm->shadow_msr_intercept.read);
		u32 write = test_bit(i, svm->shadow_msr_intercept.write);

		set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
	}
}

static void add_msr_offset(u32 offset)
{
	int i;

	for (i = 0; i < MSRPM_OFFSETS; ++i) {

		/* Offset already in list? */
		if (msrpm_offsets[i] == offset)
			return;

		/* Slot used by another offset? */
		if (msrpm_offsets[i] != MSR_INVALID)
			continue;

		/* Add offset to list */
		msrpm_offsets[i] = offset;

		return;
	}

	/*
	 * If this BUG triggers the msrpm_offsets table has an overflow. Just
	 * increase MSRPM_OFFSETS in this case.
	 */
	BUG();
}

static void init_msrpm_offsets(void)
{
	int i;

	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		u32 offset;

		offset = svm_msrpm_offset(direct_access_msrs[i].index);
		BUG_ON(offset == MSR_INVALID);

		add_msr_offset(offset);
	}
}

void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
{
	to_vmcb->save.dbgctl		= from_vmcb->save.dbgctl;
	to_vmcb->save.br_from		= from_vmcb->save.br_from;
	to_vmcb->save.br_to		= from_vmcb->save.br_to;
	to_vmcb->save.last_excp_from	= from_vmcb->save.last_excp_from;
	to_vmcb->save.last_excp_to	= from_vmcb->save.last_excp_to;

	vmcb_mark_dirty(to_vmcb, VMCB_LBR);
}

static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);

	/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
	if (is_guest_mode(vcpu))
		svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
}

static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);

	/*
	 * Move the LBR msrs back to the vmcb01 to avoid copying them
	 * on nested guest entries.
	 */
	if (is_guest_mode(vcpu))
		svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
}

static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
{
	/*
	 * If the LBR virtualization is disabled, the LBR msrs are always
	 * kept in the vmcb01 to avoid copying them on nested guest entries.
	 *
	 * If nested, and the LBR virtualization is enabled/disabled, the msrs
	 * are moved between the vmcb01 and vmcb02 as needed.
	 */
	struct vmcb *vmcb =
		(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
			svm->vmcb : svm->vmcb01.ptr;

	switch (index) {
	case MSR_IA32_DEBUGCTLMSR:
		return vmcb->save.dbgctl;
	case MSR_IA32_LASTBRANCHFROMIP:
		return vmcb->save.br_from;
	case MSR_IA32_LASTBRANCHTOIP:
		return vmcb->save.br_to;
	case MSR_IA32_LASTINTFROMIP:
		return vmcb->save.last_excp_from;
	case MSR_IA32_LASTINTTOIP:
		return vmcb->save.last_excp_to;
	default:
		KVM_BUG(false, svm->vcpu.kvm,
			"%s: Unknown MSR 0x%x", __func__, index);
		return 0;
	}
}

void svm_update_lbrv(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
					   DEBUGCTLMSR_LBR;

	bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
				      LBR_CTL_ENABLE_MASK);

	if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
		if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
			enable_lbrv = true;

	if (enable_lbrv == current_enable_lbrv)
		return;

	if (enable_lbrv)
		svm_enable_lbrv(vcpu);
	else
		svm_disable_lbrv(vcpu);
}

void disable_nmi_singlestep(struct vcpu_svm *svm)
{
	svm->nmi_singlestep = false;

	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
		/* Clear our flags if they were not set by the guest */
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
	}
}

static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	if (kvm_pause_in_guest(vcpu->kvm))
		return;

	control->pause_filter_count = __grow_ple_window(old,
							pause_filter_count,
							pause_filter_count_grow,
							pause_filter_count_max);

	if (control->pause_filter_count != old) {
		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    control->pause_filter_count, old);
	}
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	if (kvm_pause_in_guest(vcpu->kvm))
		return;

	control->pause_filter_count =
			__shrink_ple_window(old,
					    pause_filter_count,
					    pause_filter_count_shrink,
					    pause_filter_count);
	if (control->pause_filter_count != old) {
		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    control->pause_filter_count, old);
	}
}

static void svm_hardware_unsetup(void)
{
	int cpu;

	sev_hardware_unsetup();

	for_each_possible_cpu(cpu)
		svm_cpu_uninit(cpu);

	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
		     get_order(IOPM_SIZE));
	iopm_base = 0;
}

static void init_seg(struct vmcb_seg *seg)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
	seg->limit = 0xffff;
	seg->base = 0;
}

static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | type;
	seg->limit = 0xffff;
	seg->base = 0;
}
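
/*
 * The two helpers below only report the L2 (nested) values; combining them
 * with the L1 offset/multiplier into what actually gets programmed into the
 * VMCB is left to the common x86 nested-TSC code.
 */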

static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return svm->nested.ctl.tsc_offset;
}

static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return svm->tsc_ratio_msr;
}

static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
	svm->vmcb->control.tsc_offset = offset;
	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}

static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
{
	__svm_write_tsc_multiplier(multiplier);
}


/* Evaluate instruction intercepts that depend on guest CPUID features. */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
					      struct vcpu_svm *svm)
{
	/*
	 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
	 * roots, or if INVPCID is disabled in the guest to inject #UD.
	 */
	if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
		if (!npt_enabled ||
		    !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
			svm_set_intercept(svm, INTERCEPT_INVPCID);
		else
			svm_clr_intercept(svm, INTERCEPT_INVPCID);
	}

	if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
		if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
			svm_clr_intercept(svm, INTERCEPT_RDTSCP);
		else
			svm_set_intercept(svm, INTERCEPT_RDTSCP);
	}
}

static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (guest_cpuid_is_intel(vcpu)) {
		/*
		 * We must intercept SYSENTER_EIP and SYSENTER_ESP
		 * accesses because the processor only stores 32 bits.
		 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
		 */
		svm_set_intercept(svm, INTERCEPT_VMLOAD);
		svm_set_intercept(svm, INTERCEPT_VMSAVE);
		svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;

		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);

		svm->v_vmload_vmsave_enabled = false;
	} else {
		/*
		 * If hardware supports Virtual VMLOAD VMSAVE then enable it
		 * in VMCB and clear intercepts to avoid #VMEXIT.
		 */
		if (vls) {
			svm_clr_intercept(svm, INTERCEPT_VMLOAD);
			svm_clr_intercept(svm, INTERCEPT_VMSAVE);
			svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
		}
		/* No need to intercept these MSRs */
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
	}
}

static void init_vmcb(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb01.ptr;
	struct vmcb_control_area *control = &vmcb->control;
	struct vmcb_save_area *save = &vmcb->save;

	svm_set_intercept(svm, INTERCEPT_CR0_READ);
	svm_set_intercept(svm, INTERCEPT_CR3_READ);
	svm_set_intercept(svm, INTERCEPT_CR4_READ);
	svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
	svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
	svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
	if (!kvm_vcpu_apicv_active(vcpu))
		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);

	set_dr_intercepts(svm);

	set_exception_intercept(svm, PF_VECTOR);
	set_exception_intercept(svm, UD_VECTOR);
	set_exception_intercept(svm, MC_VECTOR);
	set_exception_intercept(svm, AC_VECTOR);
	set_exception_intercept(svm, DB_VECTOR);
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does. Don't intercept #GP for SEV guests as KVM can't
	 * decrypt guest memory to decode the faulting instruction.
	 */
	if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
		set_exception_intercept(svm, GP_VECTOR);

	svm_set_intercept(svm, INTERCEPT_INTR);
	svm_set_intercept(svm, INTERCEPT_NMI);

	if (intercept_smi)
		svm_set_intercept(svm, INTERCEPT_SMI);

	svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
	svm_set_intercept(svm, INTERCEPT_RDPMC);
	svm_set_intercept(svm, INTERCEPT_CPUID);
	svm_set_intercept(svm, INTERCEPT_INVD);
	svm_set_intercept(svm, INTERCEPT_INVLPG);
	svm_set_intercept(svm, INTERCEPT_INVLPGA);
	svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
	svm_set_intercept(svm, INTERCEPT_MSR_PROT);
	svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
	svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
	svm_set_intercept(svm, INTERCEPT_VMRUN);
	svm_set_intercept(svm, INTERCEPT_VMMCALL);
	svm_set_intercept(svm, INTERCEPT_VMLOAD);
	svm_set_intercept(svm, INTERCEPT_VMSAVE);
	svm_set_intercept(svm, INTERCEPT_STGI);
	svm_set_intercept(svm, INTERCEPT_CLGI);
	svm_set_intercept(svm, INTERCEPT_SKINIT);
	svm_set_intercept(svm, INTERCEPT_WBINVD);
	svm_set_intercept(svm, INTERCEPT_XSETBV);
	svm_set_intercept(svm, INTERCEPT_RDPRU);
	svm_set_intercept(svm, INTERCEPT_RSM);

	if (!kvm_mwait_in_guest(vcpu->kvm)) {
		svm_set_intercept(svm, INTERCEPT_MONITOR);
		svm_set_intercept(svm, INTERCEPT_MWAIT);
	}

	if (!kvm_hlt_in_guest(vcpu->kvm))
		svm_set_intercept(svm, INTERCEPT_HLT);

	control->iopm_base_pa = __sme_set(iopm_base);
	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
	control->int_ctl = V_INTR_MASKING_MASK;

	init_seg(&save->es);
	init_seg(&save->ss);
	init_seg(&save->ds);
	init_seg(&save->fs);
	init_seg(&save->gs);

	save->cs.selector = 0xf000;
	save->cs.base = 0xffff0000;
	/* Executable/Readable Code Segment */
	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
		SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
	save->cs.limit = 0xffff;

	save->gdtr.base = 0;
	save->gdtr.limit = 0xffff;
	save->idtr.base = 0;
	save->idtr.limit = 0xffff;

	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);

	if (npt_enabled) {
		/* Setup VMCB for Nested Paging */
		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
		svm_clr_intercept(svm, INTERCEPT_INVLPG);
		clr_exception_intercept(svm, PF_VECTOR);
		svm_clr_intercept(svm, INTERCEPT_CR3_READ);
		svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
		save->g_pat = vcpu->arch.pat;
		save->cr3 = 0;
	}
	svm->current_vmcb->asid_generation = 0;
	svm->asid = 0;

	svm->nested.vmcb12_gpa = INVALID_GPA;
	svm->nested.last_vmcb12_gpa = INVALID_GPA;

	if (!kvm_pause_in_guest(vcpu->kvm)) {
		control->pause_filter_count = pause_filter_count;
		if (pause_filter_thresh)
			control->pause_filter_thresh = pause_filter_thresh;
		svm_set_intercept(svm, INTERCEPT_PAUSE);
	} else {
		svm_clr_intercept(svm, INTERCEPT_PAUSE);
	}

	svm_recalc_instruction_intercepts(vcpu, svm);

	/*
	 * If the host supports V_SPEC_CTRL then disable the interception
	 * of MSR_IA32_SPEC_CTRL.
	 */
	if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);

	if (kvm_vcpu_apicv_active(vcpu))
		avic_init_vmcb(svm, vmcb);

	if (vgif) {
		svm_clr_intercept(svm, INTERCEPT_STGI);
		svm_clr_intercept(svm, INTERCEPT_CLGI);
		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
	}

	if (sev_guest(vcpu->kvm))
		sev_init_vmcb(svm);

	svm_hv_init_vmcb(vmcb);
	init_vmcb_after_set_cpuid(vcpu);

	vmcb_mark_all_dirty(vmcb);

	enable_gif(svm);
}

static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm_vcpu_init_msrpm(vcpu, svm->msrpm);

	svm_init_osvw(vcpu);
	vcpu->arch.microcode_version = 0x01000065;
	svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio;

	if (sev_es_guest(vcpu->kvm))
		sev_es_vcpu_reset(svm);
}

static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->spec_ctrl = 0;
	svm->virt_spec_ctrl = 0;

	init_vmcb(vcpu);

	if (!init_event)
		__svm_vcpu_reset(vcpu);
}
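
/*
 * A vCPU always owns vmcb01, the VMCB used while running L1.  When a nested
 * guest is entered, svm->vmcb/current_vmcb are switched over to vmcb02 and
 * back again on nested #VMEXIT; svm_switch_vmcb() only swaps the pointers,
 * keeping the contents coherent is up to the callers.
 */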

void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
{
	svm->current_vmcb = target_vmcb;
	svm->vmcb = target_vmcb->ptr;
}

static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm;
	struct page *vmcb01_page;
	struct page *vmsa_page = NULL;
	int err;

	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
	svm = to_svm(vcpu);

	err = -ENOMEM;
	vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!vmcb01_page)
		goto out;

	if (sev_es_guest(vcpu->kvm)) {
		/*
		 * SEV-ES guests require a separate VMSA page used to contain
		 * the encrypted register state of the guest.
		 */
		vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!vmsa_page)
			goto error_free_vmcb_page;

		/*
		 * SEV-ES guests maintain an encrypted version of their FPU
		 * state which is restored and saved on VMRUN and VMEXIT.
		 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
		 * do xsave/xrstor on it.
		 */
		fpstate_set_confidential(&vcpu->arch.guest_fpu);
	}

	err = avic_init_vcpu(svm);
	if (err)
		goto error_free_vmsa_page;

	svm->msrpm = svm_vcpu_alloc_msrpm();
	if (!svm->msrpm) {
		err = -ENOMEM;
		goto error_free_vmsa_page;
	}

	svm->vmcb01.ptr = page_address(vmcb01_page);
	svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
	svm_switch_vmcb(svm, &svm->vmcb01);

	if (vmsa_page)
		svm->sev_es.vmsa = page_address(vmsa_page);

	svm->guest_state_loaded = false;

	return 0;

error_free_vmsa_page:
	if (vmsa_page)
		__free_page(vmsa_page);
error_free_vmcb_page:
	__free_page(vmcb01_page);
out:
	return err;
}

static void svm_clear_current_vmcb(struct vmcb *vmcb)
{
	int i;

	for_each_online_cpu(i)
		cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
}

static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * The vmcb page can be recycled, causing a false negative in
	 * svm_vcpu_load(). So, ensure that no logical CPU has this
	 * vmcb page recorded as its current vmcb.
	 */
	svm_clear_current_vmcb(svm->vmcb);

	svm_free_nested(svm);

	sev_free_vcpu(vcpu);

	__free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
	__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
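
/*
 * Note: the vmsave() below stashes the host's FS/GS/TR/LDTR state and the
 * SYSCALL/SYSENTER MSRs in the per-CPU host save area, so that state can be
 * restored with a single VMLOAD of the host save area (or automatically on
 * VMEXIT for SEV-ES) instead of piecemeal segment and MSR writes.
 */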

static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);

	if (sev_es_guest(vcpu->kvm))
		sev_es_unmap_ghcb(svm);

	if (svm->guest_state_loaded)
		return;

	/*
	 * Save additional host state that will be restored on VMEXIT (sev-es)
	 * or subsequent vmload of host save area.
	 */
	vmsave(__sme_page_pa(sd->save_area));
	if (sev_es_guest(vcpu->kvm)) {
		struct sev_es_save_area *hostsa;
		hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);

		sev_es_prepare_switch_to_guest(hostsa);
	}

	if (tsc_scaling)
		__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);

	if (likely(tsc_aux_uret_slot >= 0))
		kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);

	svm->guest_state_loaded = true;
}

static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
{
	to_svm(vcpu)->guest_state_loaded = false;
}

static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);

	if (sd->current_vmcb != svm->vmcb) {
		sd->current_vmcb = svm->vmcb;
		indirect_branch_prediction_barrier();
	}
	if (kvm_vcpu_apicv_active(vcpu))
		avic_vcpu_load(vcpu, cpu);
}

static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
	if (kvm_vcpu_apicv_active(vcpu))
		avic_vcpu_put(vcpu);

	svm_prepare_host_switch(vcpu);

	++vcpu->stat.host_state_reload;
}

static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long rflags = svm->vmcb->save.rflags;

	if (svm->nmi_singlestep) {
		/* Hide our flags if they were not set by the guest */
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
			rflags &= ~X86_EFLAGS_TF;
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
			rflags &= ~X86_EFLAGS_RF;
	}
	return rflags;
}

static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	if (to_svm(vcpu)->nmi_singlestep)
		rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);

	/*
	 * Any change of EFLAGS.VM is accompanied by a reload of SS
	 * (caused by either a task switch or an inter-privilege IRET),
	 * so we do not need to update the CPL here.
	 */
	to_svm(vcpu)->vmcb->save.rflags = rflags;
}
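
/*
 * For SEV-ES the guest's RFLAGS lives in the encrypted VMSA, so interrupt
 * readiness has to be read from the SVM_GUEST_INTERRUPT_MASK bit that the
 * CPU reflects into int_state rather than from RFLAGS.IF.
 */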

static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
{
	struct vmcb *vmcb = to_svm(vcpu)->vmcb;

	return sev_es_guest(vcpu->kvm)
		? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
		: kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
}

static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	kvm_register_mark_available(vcpu, reg);

	switch (reg) {
	case VCPU_EXREG_PDPTR:
		/*
		 * When !npt_enabled, mmu->pdptrs[] is already available since
		 * it is always updated per SDM when moving to CRs.
		 */
		if (npt_enabled)
			load_pdptrs(vcpu, kvm_read_cr3(vcpu));
		break;
	default:
		KVM_BUG_ON(1, vcpu->kvm);
	}
}

static void svm_set_vintr(struct vcpu_svm *svm)
{
	struct vmcb_control_area *control;

	/*
	 * The following fields are ignored when AVIC is enabled
	 */
	WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));

	svm_set_intercept(svm, INTERCEPT_VINTR);

	/*
	 * This is just a dummy VINTR to actually cause a vmexit to happen.
	 * Actual injection of virtual interrupts happens through EVENTINJ.
	 */
	control = &svm->vmcb->control;
	control->int_vector = 0x0;
	control->int_ctl &= ~V_INTR_PRIO_MASK;
	control->int_ctl |= V_IRQ_MASK |
		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}

static void svm_clear_vintr(struct vcpu_svm *svm)
{
	svm_clr_intercept(svm, INTERCEPT_VINTR);

	/* Drop int_ctl fields related to VINTR injection.  */
	svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
	if (is_guest_mode(&svm->vcpu)) {
		svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;

		WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
			(svm->nested.ctl.int_ctl & V_TPR_MASK));

		svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
			V_IRQ_INJECTION_BITS_MASK;

		svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
	}

	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}
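
/*
 * FS, GS, TR and LDTR are deliberately read from vmcb01: they are managed
 * via VMLOAD/VMSAVE against the L1 VMCB, so vmcb01 is expected to hold the
 * authoritative values even while a nested guest (vmcb02) is active.
 */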

static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
	struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;

	switch (seg) {
	case VCPU_SREG_CS: return &save->cs;
	case VCPU_SREG_DS: return &save->ds;
	case VCPU_SREG_ES: return &save->es;
	case VCPU_SREG_FS: return &save01->fs;
	case VCPU_SREG_GS: return &save01->gs;
	case VCPU_SREG_SS: return &save->ss;
	case VCPU_SREG_TR: return &save01->tr;
	case VCPU_SREG_LDTR: return &save01->ldtr;
	}
	BUG();
	return NULL;
}

static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	return s->base;
}

static void svm_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	var->base = s->base;
	var->limit = s->limit;
	var->selector = s->selector;
	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;

	/*
	 * AMD CPUs circa 2014 track the G bit for all segments except CS.
	 * However, the SVM spec states that the G bit is not observed by the
	 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
	 * So let's synthesize a legal G bit for all segments, this helps
	 * running KVM nested. It also helps cross-vendor migration, because
	 * Intel's vmentry has a check on the 'G' bit.
	 */
	var->g = s->limit > 0xfffff;

	/*
	 * AMD's VMCB does not have an explicit unusable field, so emulate it
	 * for cross vendor migration purposes by "not present"
	 */
	var->unusable = !var->present;

	switch (seg) {
	case VCPU_SREG_TR:
		/*
		 * Work around a bug where the busy flag in the tr selector
		 * isn't exposed
		 */
		var->type |= 0x2;
		break;
	case VCPU_SREG_DS:
	case VCPU_SREG_ES:
	case VCPU_SREG_FS:
	case VCPU_SREG_GS:
		/*
		 * The accessed bit must always be set in the segment
		 * descriptor cache; although it can be cleared in the
		 * descriptor itself, the cached bit always remains at 1.
		 * Since Intel has a check on this, set it here to support
		 * cross-vendor migration.
		 */
		if (!var->unusable)
			var->type |= 0x1;
		break;
	case VCPU_SREG_SS:
		/*
		 * On AMD CPUs sometimes the DB bit in the segment
		 * descriptor is left as 1, although the whole segment has
		 * been made unusable. Clear it here to pass an Intel VMX
		 * entry check when cross vendor migrating.
		 */
		if (var->unusable)
			var->db = 0;
		/* This is symmetric with svm_set_segment() */
		var->dpl = to_svm(vcpu)->vmcb->save.cpl;
		break;
	}
}

static int svm_get_cpl(struct kvm_vcpu *vcpu)
{
	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;

	return save->cpl;
}

static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
	struct kvm_segment cs;

	svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
	*db = cs.db;
	*l = cs.l;
}

static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	dt->size = svm->vmcb->save.idtr.limit;
	dt->address = svm->vmcb->save.idtr.base;
}

static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->save.idtr.limit = dt->size;
	svm->vmcb->save.idtr.base = dt->address;
	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}

static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	dt->size = svm->vmcb->save.gdtr.limit;
	dt->address = svm->vmcb->save.gdtr.base;
}

static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->save.gdtr.limit = dt->size;
	svm->vmcb->save.gdtr.base = dt->address;
	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}

static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * For guests that don't set guest_state_protected, the cr3 update is
	 * handled via kvm_mmu_load() while entering the guest. For guests
	 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
	 * VMCB save area now, since the save area will become the initial
	 * contents of the VMSA, and future VMCB save area updates won't be
	 * seen.
	 */
	if (sev_es_guest(vcpu->kvm)) {
		svm->vmcb->save.cr3 = cr3;
		vmcb_mark_dirty(svm->vmcb, VMCB_CR);
	}
}
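
/*
 * With shadow paging (!npt_enabled) the CR0 value handed to hardware (hcr0)
 * intentionally differs from the guest's view, e.g. PG/WP are forced on.
 * Whenever hcr0 diverges from cr0, the CR0 read/write intercepts are enabled
 * below so that the guest still observes its own CR0 value.
 */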

void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 hcr0 = cr0;
	bool old_paging = is_paging(vcpu);

#ifdef CONFIG_X86_64
	if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
			vcpu->arch.efer |= EFER_LMA;
			svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
		}

		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
			vcpu->arch.efer &= ~EFER_LMA;
			svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
		}
	}
#endif
	vcpu->arch.cr0 = cr0;

	if (!npt_enabled) {
		hcr0 |= X86_CR0_PG | X86_CR0_WP;
		if (old_paging != is_paging(vcpu))
			svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
	}

	/*
	 * re-enable caching here because the QEMU bios
	 * does not do it - this results in some delay at
	 * reboot
	 */
	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
		hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);

	svm->vmcb->save.cr0 = hcr0;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);

	/*
	 * SEV-ES guests must always keep the CR intercepts cleared. CR
	 * tracking is done using the CR write traps.
	 */
	if (sev_es_guest(vcpu->kvm))
		return;

	if (hcr0 == cr0) {
		/* Selective CR0 write remains on. */
		svm_clr_intercept(svm, INTERCEPT_CR0_READ);
		svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
	} else {
		svm_set_intercept(svm, INTERCEPT_CR0_READ);
		svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
	}
}

static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	return true;
}

void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
	unsigned long old_cr4 = vcpu->arch.cr4;

	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
		svm_flush_tlb_current(vcpu);

	vcpu->arch.cr4 = cr4;
	if (!npt_enabled) {
		cr4 |= X86_CR4_PAE;

		if (!is_paging(vcpu))
			cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
	}
	cr4 |= host_cr4_mce;
	to_svm(vcpu)->vmcb->save.cr4 = cr4;
	vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);

	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
		kvm_update_cpuid_runtime(vcpu);
}

static void svm_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	s->base = var->base;
	s->limit = var->limit;
	s->selector = var->selector;
	s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
	s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
	s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
	s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
	s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
	s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
	s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
	s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;

	/*
	 * This is always accurate, except if SYSRET returned to a segment
	 * with SS.DPL != 3.  Intel does not have this quirk, and always
	 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
	 * would entail passing the CPL to userspace and back.
	 */
	if (seg == VCPU_SREG_SS)
		/* This is symmetric with svm_get_segment() */
		svm->vmcb->save.cpl = (var->dpl & 3);

	vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
}

static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	clr_exception_intercept(svm, BP_VECTOR);

	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			set_exception_intercept(svm, BP_VECTOR);
	}
}
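
/*
 * ASIDs are handed out per physical CPU.  When the pool runs out, the
 * generation counter is bumped and TLB_CONTROL_FLUSH_ALL_ASID is requested
 * so that previously used ASIDs can be recycled safely; ASIDs below min_asid
 * are reserved for SEV guests (see svm_hardware_enable()).
 */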

static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
	if (sd->next_asid > sd->max_asid) {
		++sd->asid_generation;
		sd->next_asid = sd->min_asid;
		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
	}

	svm->current_vmcb->asid_generation = sd->asid_generation;
	svm->asid = sd->next_asid++;
}

static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
{
	struct vmcb *vmcb = svm->vmcb;

	if (svm->vcpu.arch.guest_state_protected)
		return;

	if (unlikely(value != vmcb->save.dr6)) {
		vmcb->save.dr6 = value;
		vmcb_mark_dirty(vmcb, VMCB_DR);
	}
}

static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (vcpu->arch.guest_state_protected)
		return;

	get_debugreg(vcpu->arch.db[0], 0);
	get_debugreg(vcpu->arch.db[1], 1);
	get_debugreg(vcpu->arch.db[2], 2);
	get_debugreg(vcpu->arch.db[3], 3);
	/*
	 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
	 * because db_interception might need it.  We can do it before vmentry.
	 */
	vcpu->arch.dr6 = svm->vmcb->save.dr6;
	vcpu->arch.dr7 = svm->vmcb->save.dr7;
	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
	set_dr_intercepts(svm);
}

static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (vcpu->arch.guest_state_protected)
		return;

	svm->vmcb->save.dr7 = value;
	vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}

static int pf_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	u64 fault_address = svm->vmcb->control.exit_info_2;
	u64 error_code = svm->vmcb->control.exit_info_1;

	return kvm_handle_page_fault(vcpu, error_code, fault_address,
			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
			svm->vmcb->control.insn_bytes : NULL,
			svm->vmcb->control.insn_len);
}

static int npf_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	u64 fault_address = svm->vmcb->control.exit_info_2;
	u64 error_code = svm->vmcb->control.exit_info_1;

	trace_kvm_page_fault(fault_address, error_code);
	return kvm_mmu_page_fault(vcpu, fault_address, error_code,
			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
			svm->vmcb->control.insn_bytes : NULL,
			svm->vmcb->control.insn_len);
}

static int db_interception(struct kvm_vcpu *vcpu)
{
	struct kvm_run *kvm_run = vcpu->run;
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!(vcpu->guest_debug &
	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
	    !svm->nmi_singlestep) {
		u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
		kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
		return 1;
	}

	if (svm->nmi_singlestep) {
		disable_nmi_singlestep(svm);
		/* Make sure we check for pending NMIs upon entry */
		kvm_make_request(KVM_REQ_EVENT, vcpu);
	}

	if (vcpu->guest_debug &
	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
		kvm_run->exit_reason = KVM_EXIT_DEBUG;
		kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
		kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
		kvm_run->debug.arch.pc =
			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
		kvm_run->debug.arch.exception = DB_VECTOR;
		return 0;
	}

	return 1;
}

static int bp_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_run *kvm_run = vcpu->run;

	kvm_run->exit_reason = KVM_EXIT_DEBUG;
	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
	kvm_run->debug.arch.exception = BP_VECTOR;
	return 0;
}

static int ud_interception(struct kvm_vcpu *vcpu)
{
	return handle_ud(vcpu);
}

static int ac_interception(struct kvm_vcpu *vcpu)
{
	kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
	return 1;
}

static bool is_erratum_383(void)
{
	int err, i;
	u64 value;

	if (!erratum_383_found)
		return false;

	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
	if (err)
		return false;

	/* Bit 62 may or may not be set for this mce */
	value &= ~(1ULL << 62);

	if (value != 0xb600000000010015ULL)
		return false;

	/* Clear MCi_STATUS registers */
	for (i = 0; i < 6; ++i)
		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);

	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
	if (!err) {
		u32 low, high;

		value &= ~(1ULL << 2);
		low    = lower_32_bits(value);
		high   = upper_32_bits(value);

		native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
	}

	/* Flush tlb to evict multi-match entries */
	__flush_tlb_all();

	return true;
}

static void svm_handle_mce(struct kvm_vcpu *vcpu)
{
	if (is_erratum_383()) {
		/*
		 * Erratum 383 triggered. Guest state is corrupt so kill the
		 * guest.
		 */
		pr_err("KVM: Guest triggered AMD Erratum 383\n");

		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);

		return;
	}

	/*
	 * On an #MC intercept the MCE handler is not called automatically in
	 * the host. So do it by hand here.
	 */
	kvm_machine_check();
}

static int mc_interception(struct kvm_vcpu *vcpu)
{
	return 1;
}
2010 */ 2011 if (sev_es_guest(vcpu->kvm)) 2012 return -EINVAL; 2013 2014 /* 2015 * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put 2016 * the VMCB in a known good state. Unfortunately, KVM doesn't have 2017 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking 2018 * userspace. From a platform view, INIT is acceptable behavior as 2019 * there exist bare metal platforms that automatically INIT the CPU 2020 * in response to shutdown. 2021 */ 2022 clear_page(svm->vmcb); 2023 kvm_vcpu_reset(vcpu, true); 2024 2025 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2026 return 0; 2027 } 2028 2029 static int io_interception(struct kvm_vcpu *vcpu) 2030 { 2031 struct vcpu_svm *svm = to_svm(vcpu); 2032 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 2033 int size, in, string; 2034 unsigned port; 2035 2036 ++vcpu->stat.io_exits; 2037 string = (io_info & SVM_IOIO_STR_MASK) != 0; 2038 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 2039 port = io_info >> 16; 2040 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 2041 2042 if (string) { 2043 if (sev_es_guest(vcpu->kvm)) 2044 return sev_es_string_io(svm, size, port, in); 2045 else 2046 return kvm_emulate_instruction(vcpu, 0); 2047 } 2048 2049 svm->next_rip = svm->vmcb->control.exit_info_2; 2050 2051 return kvm_fast_pio(vcpu, size, port, in); 2052 } 2053 2054 static int nmi_interception(struct kvm_vcpu *vcpu) 2055 { 2056 return 1; 2057 } 2058 2059 static int smi_interception(struct kvm_vcpu *vcpu) 2060 { 2061 return 1; 2062 } 2063 2064 static int intr_interception(struct kvm_vcpu *vcpu) 2065 { 2066 ++vcpu->stat.irq_exits; 2067 return 1; 2068 } 2069 2070 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) 2071 { 2072 struct vcpu_svm *svm = to_svm(vcpu); 2073 struct vmcb *vmcb12; 2074 struct kvm_host_map map; 2075 int ret; 2076 2077 if (nested_svm_check_permissions(vcpu)) 2078 return 1; 2079 2080 ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); 2081 if (ret) { 2082 if (ret == -EINVAL) 2083 kvm_inject_gp(vcpu, 0); 2084 return 1; 2085 } 2086 2087 vmcb12 = map.hva; 2088 2089 ret = kvm_skip_emulated_instruction(vcpu); 2090 2091 if (vmload) { 2092 svm_copy_vmloadsave_state(svm->vmcb, vmcb12); 2093 svm->sysenter_eip_hi = 0; 2094 svm->sysenter_esp_hi = 0; 2095 } else { 2096 svm_copy_vmloadsave_state(vmcb12, svm->vmcb); 2097 } 2098 2099 kvm_vcpu_unmap(vcpu, &map, true); 2100 2101 return ret; 2102 } 2103 2104 static int vmload_interception(struct kvm_vcpu *vcpu) 2105 { 2106 return vmload_vmsave_interception(vcpu, true); 2107 } 2108 2109 static int vmsave_interception(struct kvm_vcpu *vcpu) 2110 { 2111 return vmload_vmsave_interception(vcpu, false); 2112 } 2113 2114 static int vmrun_interception(struct kvm_vcpu *vcpu) 2115 { 2116 if (nested_svm_check_permissions(vcpu)) 2117 return 1; 2118 2119 return nested_svm_vmrun(vcpu); 2120 } 2121 2122 enum { 2123 NONE_SVM_INSTR, 2124 SVM_INSTR_VMRUN, 2125 SVM_INSTR_VMLOAD, 2126 SVM_INSTR_VMSAVE, 2127 }; 2128 2129 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */ 2130 static int svm_instr_opcode(struct kvm_vcpu *vcpu) 2131 { 2132 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 2133 2134 if (ctxt->b != 0x1 || ctxt->opcode_len != 2) 2135 return NONE_SVM_INSTR; 2136 2137 switch (ctxt->modrm) { 2138 case 0xd8: /* VMRUN */ 2139 return SVM_INSTR_VMRUN; 2140 case 0xda: /* VMLOAD */ 2141 return SVM_INSTR_VMLOAD; 2142 case 0xdb: /* VMSAVE */ 2143 return SVM_INSTR_VMSAVE; 2144 default: 2145 break; 2146 } 2147 2148
return NONE_SVM_INSTR; 2149 } 2150 2151 static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) 2152 { 2153 const int guest_mode_exit_codes[] = { 2154 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN, 2155 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD, 2156 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE, 2157 }; 2158 int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = { 2159 [SVM_INSTR_VMRUN] = vmrun_interception, 2160 [SVM_INSTR_VMLOAD] = vmload_interception, 2161 [SVM_INSTR_VMSAVE] = vmsave_interception, 2162 }; 2163 struct vcpu_svm *svm = to_svm(vcpu); 2164 int ret; 2165 2166 if (is_guest_mode(vcpu)) { 2167 /* Returns '1' or -errno on failure, '0' on success. */ 2168 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]); 2169 if (ret) 2170 return ret; 2171 return 1; 2172 } 2173 return svm_instr_handlers[opcode](vcpu); 2174 } 2175 2176 /* 2177 * #GP handling code. Note that #GP can be triggered under the following two 2178 * cases: 2179 * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on 2180 * some AMD CPUs when EAX of these instructions are in the reserved memory 2181 * regions (e.g. SMM memory on host). 2182 * 2) VMware backdoor 2183 */ 2184 static int gp_interception(struct kvm_vcpu *vcpu) 2185 { 2186 struct vcpu_svm *svm = to_svm(vcpu); 2187 u32 error_code = svm->vmcb->control.exit_info_1; 2188 int opcode; 2189 2190 /* Both #GP cases have zero error_code */ 2191 if (error_code) 2192 goto reinject; 2193 2194 /* Decode the instruction for usage later */ 2195 if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) 2196 goto reinject; 2197 2198 opcode = svm_instr_opcode(vcpu); 2199 2200 if (opcode == NONE_SVM_INSTR) { 2201 if (!enable_vmware_backdoor) 2202 goto reinject; 2203 2204 /* 2205 * VMware backdoor emulation on #GP interception only handles 2206 * IN{S}, OUT{S}, and RDPMC. 2207 */ 2208 if (!is_guest_mode(vcpu)) 2209 return kvm_emulate_instruction(vcpu, 2210 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); 2211 } else { 2212 /* All SVM instructions expect page aligned RAX */ 2213 if (svm->vmcb->save.rax & ~PAGE_MASK) 2214 goto reinject; 2215 2216 return emulate_svm_instr(vcpu, opcode); 2217 } 2218 2219 reinject: 2220 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 2221 return 1; 2222 } 2223 2224 void svm_set_gif(struct vcpu_svm *svm, bool value) 2225 { 2226 if (value) { 2227 /* 2228 * If VGIF is enabled, the STGI intercept is only added to 2229 * detect the opening of the SMI/NMI window; remove it now. 2230 * Likewise, clear the VINTR intercept, we will set it 2231 * again while processing KVM_REQ_EVENT if needed. 2232 */ 2233 if (vgif) 2234 svm_clr_intercept(svm, INTERCEPT_STGI); 2235 if (svm_is_intercept(svm, INTERCEPT_VINTR)) 2236 svm_clear_vintr(svm); 2237 2238 enable_gif(svm); 2239 if (svm->vcpu.arch.smi_pending || 2240 svm->vcpu.arch.nmi_pending || 2241 kvm_cpu_has_injectable_intr(&svm->vcpu)) 2242 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 2243 } else { 2244 disable_gif(svm); 2245 2246 /* 2247 * After a CLGI no interrupts should come. But if vGIF is 2248 * in use, we still rely on the VINTR intercept (rather than 2249 * STGI) to detect an open interrupt window. 
2250 */ 2251 if (!vgif) 2252 svm_clear_vintr(svm); 2253 } 2254 } 2255 2256 static int stgi_interception(struct kvm_vcpu *vcpu) 2257 { 2258 int ret; 2259 2260 if (nested_svm_check_permissions(vcpu)) 2261 return 1; 2262 2263 ret = kvm_skip_emulated_instruction(vcpu); 2264 svm_set_gif(to_svm(vcpu), true); 2265 return ret; 2266 } 2267 2268 static int clgi_interception(struct kvm_vcpu *vcpu) 2269 { 2270 int ret; 2271 2272 if (nested_svm_check_permissions(vcpu)) 2273 return 1; 2274 2275 ret = kvm_skip_emulated_instruction(vcpu); 2276 svm_set_gif(to_svm(vcpu), false); 2277 return ret; 2278 } 2279 2280 static int invlpga_interception(struct kvm_vcpu *vcpu) 2281 { 2282 gva_t gva = kvm_rax_read(vcpu); 2283 u32 asid = kvm_rcx_read(vcpu); 2284 2285 /* FIXME: Handle an address size prefix. */ 2286 if (!is_long_mode(vcpu)) 2287 gva = (u32)gva; 2288 2289 trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva); 2290 2291 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 2292 kvm_mmu_invlpg(vcpu, gva); 2293 2294 return kvm_skip_emulated_instruction(vcpu); 2295 } 2296 2297 static int skinit_interception(struct kvm_vcpu *vcpu) 2298 { 2299 trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu)); 2300 2301 kvm_queue_exception(vcpu, UD_VECTOR); 2302 return 1; 2303 } 2304 2305 static int task_switch_interception(struct kvm_vcpu *vcpu) 2306 { 2307 struct vcpu_svm *svm = to_svm(vcpu); 2308 u16 tss_selector; 2309 int reason; 2310 int int_type = svm->vmcb->control.exit_int_info & 2311 SVM_EXITINTINFO_TYPE_MASK; 2312 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; 2313 uint32_t type = 2314 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; 2315 uint32_t idt_v = 2316 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; 2317 bool has_error_code = false; 2318 u32 error_code = 0; 2319 2320 tss_selector = (u16)svm->vmcb->control.exit_info_1; 2321 2322 if (svm->vmcb->control.exit_info_2 & 2323 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) 2324 reason = TASK_SWITCH_IRET; 2325 else if (svm->vmcb->control.exit_info_2 & 2326 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) 2327 reason = TASK_SWITCH_JMP; 2328 else if (idt_v) 2329 reason = TASK_SWITCH_GATE; 2330 else 2331 reason = TASK_SWITCH_CALL; 2332 2333 if (reason == TASK_SWITCH_GATE) { 2334 switch (type) { 2335 case SVM_EXITINTINFO_TYPE_NMI: 2336 vcpu->arch.nmi_injected = false; 2337 break; 2338 case SVM_EXITINTINFO_TYPE_EXEPT: 2339 if (svm->vmcb->control.exit_info_2 & 2340 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { 2341 has_error_code = true; 2342 error_code = 2343 (u32)svm->vmcb->control.exit_info_2; 2344 } 2345 kvm_clear_exception_queue(vcpu); 2346 break; 2347 case SVM_EXITINTINFO_TYPE_INTR: 2348 kvm_clear_interrupt_queue(vcpu); 2349 break; 2350 default: 2351 break; 2352 } 2353 } 2354 2355 if (reason != TASK_SWITCH_GATE || 2356 int_type == SVM_EXITINTINFO_TYPE_SOFT || 2357 (int_type == SVM_EXITINTINFO_TYPE_EXEPT && 2358 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { 2359 if (!svm_skip_emulated_instruction(vcpu)) 2360 return 0; 2361 } 2362 2363 if (int_type != SVM_EXITINTINFO_TYPE_SOFT) 2364 int_vec = -1; 2365 2366 return kvm_task_switch(vcpu, tss_selector, int_vec, reason, 2367 has_error_code, error_code); 2368 } 2369 2370 static int iret_interception(struct kvm_vcpu *vcpu) 2371 { 2372 struct vcpu_svm *svm = to_svm(vcpu); 2373 2374 ++vcpu->stat.nmi_window_exits; 2375 vcpu->arch.hflags |= HF_IRET_MASK; 2376 if (!sev_es_guest(vcpu->kvm)) { 2377 svm_clr_intercept(svm, INTERCEPT_IRET); 2378 
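		/* Remember the RIP at which the IRET was intercepted; svm_complete_interrupts() compares it against the current RIP to tell whether the guest actually completed the IRET before NMIs are unmasked again. */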
svm->nmi_iret_rip = kvm_rip_read(vcpu); 2379 } 2380 kvm_make_request(KVM_REQ_EVENT, vcpu); 2381 return 1; 2382 } 2383 2384 static int invlpg_interception(struct kvm_vcpu *vcpu) 2385 { 2386 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 2387 return kvm_emulate_instruction(vcpu, 0); 2388 2389 kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1); 2390 return kvm_skip_emulated_instruction(vcpu); 2391 } 2392 2393 static int emulate_on_interception(struct kvm_vcpu *vcpu) 2394 { 2395 return kvm_emulate_instruction(vcpu, 0); 2396 } 2397 2398 static int rsm_interception(struct kvm_vcpu *vcpu) 2399 { 2400 return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2); 2401 } 2402 2403 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, 2404 unsigned long val) 2405 { 2406 struct vcpu_svm *svm = to_svm(vcpu); 2407 unsigned long cr0 = vcpu->arch.cr0; 2408 bool ret = false; 2409 2410 if (!is_guest_mode(vcpu) || 2411 (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) 2412 return false; 2413 2414 cr0 &= ~SVM_CR0_SELECTIVE_MASK; 2415 val &= ~SVM_CR0_SELECTIVE_MASK; 2416 2417 if (cr0 ^ val) { 2418 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; 2419 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); 2420 } 2421 2422 return ret; 2423 } 2424 2425 #define CR_VALID (1ULL << 63) 2426 2427 static int cr_interception(struct kvm_vcpu *vcpu) 2428 { 2429 struct vcpu_svm *svm = to_svm(vcpu); 2430 int reg, cr; 2431 unsigned long val; 2432 int err; 2433 2434 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 2435 return emulate_on_interception(vcpu); 2436 2437 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) 2438 return emulate_on_interception(vcpu); 2439 2440 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2441 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE) 2442 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0; 2443 else 2444 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; 2445 2446 err = 0; 2447 if (cr >= 16) { /* mov to cr */ 2448 cr -= 16; 2449 val = kvm_register_read(vcpu, reg); 2450 trace_kvm_cr_write(cr, val); 2451 switch (cr) { 2452 case 0: 2453 if (!check_selective_cr0_intercepted(vcpu, val)) 2454 err = kvm_set_cr0(vcpu, val); 2455 else 2456 return 1; 2457 2458 break; 2459 case 3: 2460 err = kvm_set_cr3(vcpu, val); 2461 break; 2462 case 4: 2463 err = kvm_set_cr4(vcpu, val); 2464 break; 2465 case 8: 2466 err = kvm_set_cr8(vcpu, val); 2467 break; 2468 default: 2469 WARN(1, "unhandled write to CR%d", cr); 2470 kvm_queue_exception(vcpu, UD_VECTOR); 2471 return 1; 2472 } 2473 } else { /* mov from cr */ 2474 switch (cr) { 2475 case 0: 2476 val = kvm_read_cr0(vcpu); 2477 break; 2478 case 2: 2479 val = vcpu->arch.cr2; 2480 break; 2481 case 3: 2482 val = kvm_read_cr3(vcpu); 2483 break; 2484 case 4: 2485 val = kvm_read_cr4(vcpu); 2486 break; 2487 case 8: 2488 val = kvm_get_cr8(vcpu); 2489 break; 2490 default: 2491 WARN(1, "unhandled read from CR%d", cr); 2492 kvm_queue_exception(vcpu, UD_VECTOR); 2493 return 1; 2494 } 2495 kvm_register_write(vcpu, reg, val); 2496 trace_kvm_cr_read(cr, val); 2497 } 2498 return kvm_complete_insn_gp(vcpu, err); 2499 } 2500 2501 static int cr_trap(struct kvm_vcpu *vcpu) 2502 { 2503 struct vcpu_svm *svm = to_svm(vcpu); 2504 unsigned long old_value, new_value; 2505 unsigned int cr; 2506 int ret = 0; 2507 2508 new_value = (unsigned long)svm->vmcb->control.exit_info_1; 2509 2510 cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP; 2511 switch (cr) { 2512 case 0: 2513 old_value = 
kvm_read_cr0(vcpu); 2514 svm_set_cr0(vcpu, new_value); 2515 2516 kvm_post_set_cr0(vcpu, old_value, new_value); 2517 break; 2518 case 4: 2519 old_value = kvm_read_cr4(vcpu); 2520 svm_set_cr4(vcpu, new_value); 2521 2522 kvm_post_set_cr4(vcpu, old_value, new_value); 2523 break; 2524 case 8: 2525 ret = kvm_set_cr8(vcpu, new_value); 2526 break; 2527 default: 2528 WARN(1, "unhandled CR%d write trap", cr); 2529 kvm_queue_exception(vcpu, UD_VECTOR); 2530 return 1; 2531 } 2532 2533 return kvm_complete_insn_gp(vcpu, ret); 2534 } 2535 2536 static int dr_interception(struct kvm_vcpu *vcpu) 2537 { 2538 struct vcpu_svm *svm = to_svm(vcpu); 2539 int reg, dr; 2540 unsigned long val; 2541 int err = 0; 2542 2543 if (vcpu->guest_debug == 0) { 2544 /* 2545 * No more DR vmexits; force a reload of the debug registers 2546 * and reenter on this instruction. The next vmexit will 2547 * retrieve the full state of the debug registers. 2548 */ 2549 clr_dr_intercepts(svm); 2550 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 2551 return 1; 2552 } 2553 2554 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) 2555 return emulate_on_interception(vcpu); 2556 2557 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2558 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; 2559 if (dr >= 16) { /* mov to DRn */ 2560 dr -= 16; 2561 val = kvm_register_read(vcpu, reg); 2562 err = kvm_set_dr(vcpu, dr, val); 2563 } else { 2564 kvm_get_dr(vcpu, dr, &val); 2565 kvm_register_write(vcpu, reg, val); 2566 } 2567 2568 return kvm_complete_insn_gp(vcpu, err); 2569 } 2570 2571 static int cr8_write_interception(struct kvm_vcpu *vcpu) 2572 { 2573 int r; 2574 2575 u8 cr8_prev = kvm_get_cr8(vcpu); 2576 /* instruction emulation calls kvm_set_cr8() */ 2577 r = cr_interception(vcpu); 2578 if (lapic_in_kernel(vcpu)) 2579 return r; 2580 if (cr8_prev <= kvm_get_cr8(vcpu)) 2581 return r; 2582 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 2583 return 0; 2584 } 2585 2586 static int efer_trap(struct kvm_vcpu *vcpu) 2587 { 2588 struct msr_data msr_info; 2589 int ret; 2590 2591 /* 2592 * Clear the EFER_SVME bit from EFER. The SVM code always sets this 2593 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against 2594 * whether the guest has X86_FEATURE_SVM - this avoids a failure if 2595 * the guest doesn't have X86_FEATURE_SVM. 
2596 */ 2597 msr_info.host_initiated = false; 2598 msr_info.index = MSR_EFER; 2599 msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME; 2600 ret = kvm_set_msr_common(vcpu, &msr_info); 2601 2602 return kvm_complete_insn_gp(vcpu, ret); 2603 } 2604 2605 static int svm_get_msr_feature(struct kvm_msr_entry *msr) 2606 { 2607 msr->data = 0; 2608 2609 switch (msr->index) { 2610 case MSR_F10H_DECFG: 2611 if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) 2612 msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; 2613 break; 2614 case MSR_IA32_PERF_CAPABILITIES: 2615 return 0; 2616 default: 2617 return KVM_MSR_RET_INVALID; 2618 } 2619 2620 return 0; 2621 } 2622 2623 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2624 { 2625 struct vcpu_svm *svm = to_svm(vcpu); 2626 2627 switch (msr_info->index) { 2628 case MSR_AMD64_TSC_RATIO: 2629 if (!msr_info->host_initiated && !svm->tsc_scaling_enabled) 2630 return 1; 2631 msr_info->data = svm->tsc_ratio_msr; 2632 break; 2633 case MSR_STAR: 2634 msr_info->data = svm->vmcb01.ptr->save.star; 2635 break; 2636 #ifdef CONFIG_X86_64 2637 case MSR_LSTAR: 2638 msr_info->data = svm->vmcb01.ptr->save.lstar; 2639 break; 2640 case MSR_CSTAR: 2641 msr_info->data = svm->vmcb01.ptr->save.cstar; 2642 break; 2643 case MSR_KERNEL_GS_BASE: 2644 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; 2645 break; 2646 case MSR_SYSCALL_MASK: 2647 msr_info->data = svm->vmcb01.ptr->save.sfmask; 2648 break; 2649 #endif 2650 case MSR_IA32_SYSENTER_CS: 2651 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs; 2652 break; 2653 case MSR_IA32_SYSENTER_EIP: 2654 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip; 2655 if (guest_cpuid_is_intel(vcpu)) 2656 msr_info->data |= (u64)svm->sysenter_eip_hi << 32; 2657 break; 2658 case MSR_IA32_SYSENTER_ESP: 2659 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp; 2660 if (guest_cpuid_is_intel(vcpu)) 2661 msr_info->data |= (u64)svm->sysenter_esp_hi << 32; 2662 break; 2663 case MSR_TSC_AUX: 2664 msr_info->data = svm->tsc_aux; 2665 break; 2666 case MSR_IA32_DEBUGCTLMSR: 2667 case MSR_IA32_LASTBRANCHFROMIP: 2668 case MSR_IA32_LASTBRANCHTOIP: 2669 case MSR_IA32_LASTINTFROMIP: 2670 case MSR_IA32_LASTINTTOIP: 2671 msr_info->data = svm_get_lbr_msr(svm, msr_info->index); 2672 break; 2673 case MSR_VM_HSAVE_PA: 2674 msr_info->data = svm->nested.hsave_msr; 2675 break; 2676 case MSR_VM_CR: 2677 msr_info->data = svm->nested.vm_cr_msr; 2678 break; 2679 case MSR_IA32_SPEC_CTRL: 2680 if (!msr_info->host_initiated && 2681 !guest_has_spec_ctrl_msr(vcpu)) 2682 return 1; 2683 2684 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 2685 msr_info->data = svm->vmcb->save.spec_ctrl; 2686 else 2687 msr_info->data = svm->spec_ctrl; 2688 break; 2689 case MSR_AMD64_VIRT_SPEC_CTRL: 2690 if (!msr_info->host_initiated && 2691 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) 2692 return 1; 2693 2694 msr_info->data = svm->virt_spec_ctrl; 2695 break; 2696 case MSR_F15H_IC_CFG: { 2697 2698 int family, model; 2699 2700 family = guest_cpuid_family(vcpu); 2701 model = guest_cpuid_model(vcpu); 2702 2703 if (family < 0 || model < 0) 2704 return kvm_get_msr_common(vcpu, msr_info); 2705 2706 msr_info->data = 0; 2707 2708 if (family == 0x15 && 2709 (model >= 0x2 && model < 0x20)) 2710 msr_info->data = 0x1E; 2711 } 2712 break; 2713 case MSR_F10H_DECFG: 2714 msr_info->data = svm->msr_decfg; 2715 break; 2716 default: 2717 return kvm_get_msr_common(vcpu, msr_info); 2718 } 2719 return 0; 2720 } 2721 2722 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) 2723 
{ 2724 struct vcpu_svm *svm = to_svm(vcpu); 2725 if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) 2726 return kvm_complete_insn_gp(vcpu, err); 2727 2728 ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); 2729 ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 2730 X86_TRAP_GP | 2731 SVM_EVTINJ_TYPE_EXEPT | 2732 SVM_EVTINJ_VALID); 2733 return 1; 2734 } 2735 2736 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) 2737 { 2738 struct vcpu_svm *svm = to_svm(vcpu); 2739 int svm_dis, chg_mask; 2740 2741 if (data & ~SVM_VM_CR_VALID_MASK) 2742 return 1; 2743 2744 chg_mask = SVM_VM_CR_VALID_MASK; 2745 2746 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) 2747 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); 2748 2749 svm->nested.vm_cr_msr &= ~chg_mask; 2750 svm->nested.vm_cr_msr |= (data & chg_mask); 2751 2752 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; 2753 2754 /* check for svm_disable while efer.svme is set */ 2755 if (svm_dis && (vcpu->arch.efer & EFER_SVME)) 2756 return 1; 2757 2758 return 0; 2759 } 2760 2761 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2762 { 2763 struct vcpu_svm *svm = to_svm(vcpu); 2764 int r; 2765 2766 u32 ecx = msr->index; 2767 u64 data = msr->data; 2768 switch (ecx) { 2769 case MSR_AMD64_TSC_RATIO: 2770 2771 if (!svm->tsc_scaling_enabled) { 2772 2773 if (!msr->host_initiated) 2774 return 1; 2775 /* 2776 * In case TSC scaling is not enabled, always 2777 * leave this MSR at the default value. 2778 * 2779 * Due to bug in qemu 6.2.0, it would try to set 2780 * this msr to 0 if tsc scaling is not enabled. 2781 * Ignore this value as well. 2782 */ 2783 if (data != 0 && data != svm->tsc_ratio_msr) 2784 return 1; 2785 break; 2786 } 2787 2788 if (data & SVM_TSC_RATIO_RSVD) 2789 return 1; 2790 2791 svm->tsc_ratio_msr = data; 2792 2793 if (svm->tsc_scaling_enabled && is_guest_mode(vcpu)) 2794 nested_svm_update_tsc_ratio_msr(vcpu); 2795 2796 break; 2797 case MSR_IA32_CR_PAT: 2798 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) 2799 return 1; 2800 vcpu->arch.pat = data; 2801 svm->vmcb01.ptr->save.g_pat = data; 2802 if (is_guest_mode(vcpu)) 2803 nested_vmcb02_compute_g_pat(svm); 2804 vmcb_mark_dirty(svm->vmcb, VMCB_NPT); 2805 break; 2806 case MSR_IA32_SPEC_CTRL: 2807 if (!msr->host_initiated && 2808 !guest_has_spec_ctrl_msr(vcpu)) 2809 return 1; 2810 2811 if (kvm_spec_ctrl_test_value(data)) 2812 return 1; 2813 2814 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 2815 svm->vmcb->save.spec_ctrl = data; 2816 else 2817 svm->spec_ctrl = data; 2818 if (!data) 2819 break; 2820 2821 /* 2822 * For non-nested: 2823 * When it's written (to non-zero) for the first time, pass 2824 * it through. 2825 * 2826 * For nested: 2827 * The handling of the MSR bitmap for L2 guests is done in 2828 * nested_svm_vmrun_msrpm. 2829 * We update the L1 MSR bit as well since it will end up 2830 * touching the MSR anyway now. 
2831 */ 2832 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); 2833 break; 2834 case MSR_IA32_PRED_CMD: 2835 if (!msr->host_initiated && 2836 !guest_has_pred_cmd_msr(vcpu)) 2837 return 1; 2838 2839 if (data & ~PRED_CMD_IBPB) 2840 return 1; 2841 if (!boot_cpu_has(X86_FEATURE_IBPB)) 2842 return 1; 2843 if (!data) 2844 break; 2845 2846 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); 2847 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); 2848 break; 2849 case MSR_AMD64_VIRT_SPEC_CTRL: 2850 if (!msr->host_initiated && 2851 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) 2852 return 1; 2853 2854 if (data & ~SPEC_CTRL_SSBD) 2855 return 1; 2856 2857 svm->virt_spec_ctrl = data; 2858 break; 2859 case MSR_STAR: 2860 svm->vmcb01.ptr->save.star = data; 2861 break; 2862 #ifdef CONFIG_X86_64 2863 case MSR_LSTAR: 2864 svm->vmcb01.ptr->save.lstar = data; 2865 break; 2866 case MSR_CSTAR: 2867 svm->vmcb01.ptr->save.cstar = data; 2868 break; 2869 case MSR_KERNEL_GS_BASE: 2870 svm->vmcb01.ptr->save.kernel_gs_base = data; 2871 break; 2872 case MSR_SYSCALL_MASK: 2873 svm->vmcb01.ptr->save.sfmask = data; 2874 break; 2875 #endif 2876 case MSR_IA32_SYSENTER_CS: 2877 svm->vmcb01.ptr->save.sysenter_cs = data; 2878 break; 2879 case MSR_IA32_SYSENTER_EIP: 2880 svm->vmcb01.ptr->save.sysenter_eip = (u32)data; 2881 /* 2882 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs 2883 * when we spoof an Intel vendor ID (for cross vendor migration). 2884 * In this case we use this intercept to track the high 2885 * 32 bit part of these msrs to support Intel's 2886 * implementation of SYSENTER/SYSEXIT. 2887 */ 2888 svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; 2889 break; 2890 case MSR_IA32_SYSENTER_ESP: 2891 svm->vmcb01.ptr->save.sysenter_esp = (u32)data; 2892 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; 2893 break; 2894 case MSR_TSC_AUX: 2895 /* 2896 * TSC_AUX is usually changed only during boot and never read 2897 * directly. Intercept TSC_AUX instead of exposing it to the 2898 * guest via direct_access_msrs, and switch it via user return. 2899 */ 2900 preempt_disable(); 2901 r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); 2902 preempt_enable(); 2903 if (r) 2904 return 1; 2905 2906 svm->tsc_aux = data; 2907 break; 2908 case MSR_IA32_DEBUGCTLMSR: 2909 if (!lbrv) { 2910 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 2911 __func__, data); 2912 break; 2913 } 2914 if (data & DEBUGCTL_RESERVED_BITS) 2915 return 1; 2916 2917 if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) 2918 svm->vmcb->save.dbgctl = data; 2919 else 2920 svm->vmcb01.ptr->save.dbgctl = data; 2921 2922 svm_update_lbrv(vcpu); 2923 2924 break; 2925 case MSR_VM_HSAVE_PA: 2926 /* 2927 * Old kernels did not validate the value written to 2928 * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid 2929 * value to allow live migrating buggy or malicious guests 2930 * originating from those kernels. 
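 * Even then, the stored value is masked down to a page-aligned address below.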
2931 */ 2932 if (!msr->host_initiated && !page_address_valid(vcpu, data)) 2933 return 1; 2934 2935 svm->nested.hsave_msr = data & PAGE_MASK; 2936 break; 2937 case MSR_VM_CR: 2938 return svm_set_vm_cr(vcpu, data); 2939 case MSR_VM_IGNNE: 2940 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 2941 break; 2942 case MSR_F10H_DECFG: { 2943 struct kvm_msr_entry msr_entry; 2944 2945 msr_entry.index = msr->index; 2946 if (svm_get_msr_feature(&msr_entry)) 2947 return 1; 2948 2949 /* Check the supported bits */ 2950 if (data & ~msr_entry.data) 2951 return 1; 2952 2953 /* Don't allow the guest to change a bit, #GP */ 2954 if (!msr->host_initiated && (data ^ msr_entry.data)) 2955 return 1; 2956 2957 svm->msr_decfg = data; 2958 break; 2959 } 2960 default: 2961 return kvm_set_msr_common(vcpu, msr); 2962 } 2963 return 0; 2964 } 2965 2966 static int msr_interception(struct kvm_vcpu *vcpu) 2967 { 2968 if (to_svm(vcpu)->vmcb->control.exit_info_1) 2969 return kvm_emulate_wrmsr(vcpu); 2970 else 2971 return kvm_emulate_rdmsr(vcpu); 2972 } 2973 2974 static int interrupt_window_interception(struct kvm_vcpu *vcpu) 2975 { 2976 kvm_make_request(KVM_REQ_EVENT, vcpu); 2977 svm_clear_vintr(to_svm(vcpu)); 2978 2979 /* 2980 * If not running nested, for AVIC, the only reason to end up here is ExtINTs. 2981 * In this case AVIC was temporarily disabled for 2982 * requesting the IRQ window and we have to re-enable it. 2983 * 2984 * If running nested, still remove the VM wide AVIC inhibit to 2985 * support the case in which the interrupt window was requested when the 2986 * vCPU was not running nested. 2987 * 2988 * All vCPUs which still run nested will keep their AVIC inhibited due 2989 * to the per-vCPU AVIC inhibition. 2990 */ 2991 kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); 2992 2993 ++vcpu->stat.irq_window_exits; 2994 return 1; 2995 } 2996 2997 static int pause_interception(struct kvm_vcpu *vcpu) 2998 { 2999 bool in_kernel; 3000 /* 3001 * CPL is not made available for an SEV-ES guest, therefore 3002 * vcpu->arch.preempted_in_kernel can never be true. Just 3003 * set in_kernel to false as well. 3004 */ 3005 in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0; 3006 3007 grow_ple_window(vcpu); 3008 3009 kvm_vcpu_on_spin(vcpu, in_kernel); 3010 return kvm_skip_emulated_instruction(vcpu); 3011 } 3012 3013 static int invpcid_interception(struct kvm_vcpu *vcpu) 3014 { 3015 struct vcpu_svm *svm = to_svm(vcpu); 3016 unsigned long type; 3017 gva_t gva; 3018 3019 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { 3020 kvm_queue_exception(vcpu, UD_VECTOR); 3021 return 1; 3022 } 3023 3024 /* 3025 * For an INVPCID intercept: 3026 * EXITINFO1 provides the linear address of the memory operand. 3027 * EXITINFO2 provides the contents of the register operand.
3028 */ 3029 type = svm->vmcb->control.exit_info_2; 3030 gva = svm->vmcb->control.exit_info_1; 3031 3032 return kvm_handle_invpcid(vcpu, type, gva); 3033 } 3034 3035 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { 3036 [SVM_EXIT_READ_CR0] = cr_interception, 3037 [SVM_EXIT_READ_CR3] = cr_interception, 3038 [SVM_EXIT_READ_CR4] = cr_interception, 3039 [SVM_EXIT_READ_CR8] = cr_interception, 3040 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception, 3041 [SVM_EXIT_WRITE_CR0] = cr_interception, 3042 [SVM_EXIT_WRITE_CR3] = cr_interception, 3043 [SVM_EXIT_WRITE_CR4] = cr_interception, 3044 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 3045 [SVM_EXIT_READ_DR0] = dr_interception, 3046 [SVM_EXIT_READ_DR1] = dr_interception, 3047 [SVM_EXIT_READ_DR2] = dr_interception, 3048 [SVM_EXIT_READ_DR3] = dr_interception, 3049 [SVM_EXIT_READ_DR4] = dr_interception, 3050 [SVM_EXIT_READ_DR5] = dr_interception, 3051 [SVM_EXIT_READ_DR6] = dr_interception, 3052 [SVM_EXIT_READ_DR7] = dr_interception, 3053 [SVM_EXIT_WRITE_DR0] = dr_interception, 3054 [SVM_EXIT_WRITE_DR1] = dr_interception, 3055 [SVM_EXIT_WRITE_DR2] = dr_interception, 3056 [SVM_EXIT_WRITE_DR3] = dr_interception, 3057 [SVM_EXIT_WRITE_DR4] = dr_interception, 3058 [SVM_EXIT_WRITE_DR5] = dr_interception, 3059 [SVM_EXIT_WRITE_DR6] = dr_interception, 3060 [SVM_EXIT_WRITE_DR7] = dr_interception, 3061 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 3062 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 3063 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 3064 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 3065 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 3066 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, 3067 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, 3068 [SVM_EXIT_INTR] = intr_interception, 3069 [SVM_EXIT_NMI] = nmi_interception, 3070 [SVM_EXIT_SMI] = smi_interception, 3071 [SVM_EXIT_VINTR] = interrupt_window_interception, 3072 [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc, 3073 [SVM_EXIT_CPUID] = kvm_emulate_cpuid, 3074 [SVM_EXIT_IRET] = iret_interception, 3075 [SVM_EXIT_INVD] = kvm_emulate_invd, 3076 [SVM_EXIT_PAUSE] = pause_interception, 3077 [SVM_EXIT_HLT] = kvm_emulate_halt, 3078 [SVM_EXIT_INVLPG] = invlpg_interception, 3079 [SVM_EXIT_INVLPGA] = invlpga_interception, 3080 [SVM_EXIT_IOIO] = io_interception, 3081 [SVM_EXIT_MSR] = msr_interception, 3082 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 3083 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 3084 [SVM_EXIT_VMRUN] = vmrun_interception, 3085 [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall, 3086 [SVM_EXIT_VMLOAD] = vmload_interception, 3087 [SVM_EXIT_VMSAVE] = vmsave_interception, 3088 [SVM_EXIT_STGI] = stgi_interception, 3089 [SVM_EXIT_CLGI] = clgi_interception, 3090 [SVM_EXIT_SKINIT] = skinit_interception, 3091 [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op, 3092 [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, 3093 [SVM_EXIT_MONITOR] = kvm_emulate_monitor, 3094 [SVM_EXIT_MWAIT] = kvm_emulate_mwait, 3095 [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv, 3096 [SVM_EXIT_RDPRU] = kvm_handle_invalid_op, 3097 [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap, 3098 [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap, 3099 [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, 3100 [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, 3101 [SVM_EXIT_INVPCID] = invpcid_interception, 3102 [SVM_EXIT_NPF] = npf_interception, 3103 [SVM_EXIT_RSM] = rsm_interception, 3104 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, 3105 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, 3106 [SVM_EXIT_VMGEXIT] = 
sev_handle_vmgexit, 3107 }; 3108 3109 static void dump_vmcb(struct kvm_vcpu *vcpu) 3110 { 3111 struct vcpu_svm *svm = to_svm(vcpu); 3112 struct vmcb_control_area *control = &svm->vmcb->control; 3113 struct vmcb_save_area *save = &svm->vmcb->save; 3114 struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; 3115 3116 if (!dump_invalid_vmcb) { 3117 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); 3118 return; 3119 } 3120 3121 pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", 3122 svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); 3123 pr_err("VMCB Control Area:\n"); 3124 pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); 3125 pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); 3126 pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff); 3127 pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16); 3128 pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]); 3129 pr_err("%-20s%08x %08x\n", "intercepts:", 3130 control->intercepts[INTERCEPT_WORD3], 3131 control->intercepts[INTERCEPT_WORD4]); 3132 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); 3133 pr_err("%-20s%d\n", "pause filter threshold:", 3134 control->pause_filter_thresh); 3135 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); 3136 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); 3137 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); 3138 pr_err("%-20s%d\n", "asid:", control->asid); 3139 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); 3140 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); 3141 pr_err("%-20s%08x\n", "int_vector:", control->int_vector); 3142 pr_err("%-20s%08x\n", "int_state:", control->int_state); 3143 pr_err("%-20s%08x\n", "exit_code:", control->exit_code); 3144 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); 3145 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); 3146 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); 3147 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); 3148 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); 3149 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); 3150 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); 3151 pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa); 3152 pr_err("%-20s%08x\n", "event_inj:", control->event_inj); 3153 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); 3154 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); 3155 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); 3156 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); 3157 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); 3158 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); 3159 pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); 3160 pr_err("VMCB State Save Area:\n"); 3161 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3162 "es:", 3163 save->es.selector, save->es.attrib, 3164 save->es.limit, save->es.base); 3165 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3166 "cs:", 3167 save->cs.selector, save->cs.attrib, 3168 save->cs.limit, save->cs.base); 3169 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3170 "ss:", 3171 save->ss.selector, save->ss.attrib, 3172 save->ss.limit, save->ss.base); 3173 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3174 "ds:", 3175 
save->ds.selector, save->ds.attrib, 3176 save->ds.limit, save->ds.base); 3177 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3178 "fs:", 3179 save01->fs.selector, save01->fs.attrib, 3180 save01->fs.limit, save01->fs.base); 3181 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3182 "gs:", 3183 save01->gs.selector, save01->gs.attrib, 3184 save01->gs.limit, save01->gs.base); 3185 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3186 "gdtr:", 3187 save->gdtr.selector, save->gdtr.attrib, 3188 save->gdtr.limit, save->gdtr.base); 3189 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3190 "ldtr:", 3191 save01->ldtr.selector, save01->ldtr.attrib, 3192 save01->ldtr.limit, save01->ldtr.base); 3193 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3194 "idtr:", 3195 save->idtr.selector, save->idtr.attrib, 3196 save->idtr.limit, save->idtr.base); 3197 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3198 "tr:", 3199 save01->tr.selector, save01->tr.attrib, 3200 save01->tr.limit, save01->tr.base); 3201 pr_err("vmpl: %d cpl: %d efer: %016llx\n", 3202 save->vmpl, save->cpl, save->efer); 3203 pr_err("%-15s %016llx %-13s %016llx\n", 3204 "cr0:", save->cr0, "cr2:", save->cr2); 3205 pr_err("%-15s %016llx %-13s %016llx\n", 3206 "cr3:", save->cr3, "cr4:", save->cr4); 3207 pr_err("%-15s %016llx %-13s %016llx\n", 3208 "dr6:", save->dr6, "dr7:", save->dr7); 3209 pr_err("%-15s %016llx %-13s %016llx\n", 3210 "rip:", save->rip, "rflags:", save->rflags); 3211 pr_err("%-15s %016llx %-13s %016llx\n", 3212 "rsp:", save->rsp, "rax:", save->rax); 3213 pr_err("%-15s %016llx %-13s %016llx\n", 3214 "star:", save01->star, "lstar:", save01->lstar); 3215 pr_err("%-15s %016llx %-13s %016llx\n", 3216 "cstar:", save01->cstar, "sfmask:", save01->sfmask); 3217 pr_err("%-15s %016llx %-13s %016llx\n", 3218 "kernel_gs_base:", save01->kernel_gs_base, 3219 "sysenter_cs:", save01->sysenter_cs); 3220 pr_err("%-15s %016llx %-13s %016llx\n", 3221 "sysenter_esp:", save01->sysenter_esp, 3222 "sysenter_eip:", save01->sysenter_eip); 3223 pr_err("%-15s %016llx %-13s %016llx\n", 3224 "gpat:", save->g_pat, "dbgctl:", save->dbgctl); 3225 pr_err("%-15s %016llx %-13s %016llx\n", 3226 "br_from:", save->br_from, "br_to:", save->br_to); 3227 pr_err("%-15s %016llx %-13s %016llx\n", 3228 "excp_from:", save->last_excp_from, 3229 "excp_to:", save->last_excp_to); 3230 } 3231 3232 static bool svm_check_exit_valid(u64 exit_code) 3233 { 3234 return (exit_code < ARRAY_SIZE(svm_exit_handlers) && 3235 svm_exit_handlers[exit_code]); 3236 } 3237 3238 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) 3239 { 3240 vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); 3241 dump_vmcb(vcpu); 3242 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3243 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 3244 vcpu->run->internal.ndata = 2; 3245 vcpu->run->internal.data[0] = exit_code; 3246 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 3247 return 0; 3248 } 3249 3250 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) 3251 { 3252 if (!svm_check_exit_valid(exit_code)) 3253 return svm_handle_invalid_exit(vcpu, exit_code); 3254 3255 #ifdef CONFIG_RETPOLINE 3256 if (exit_code == SVM_EXIT_MSR) 3257 return msr_interception(vcpu); 3258 else if (exit_code == SVM_EXIT_VINTR) 3259 return interrupt_window_interception(vcpu); 3260 else if (exit_code == SVM_EXIT_INTR) 3261 return intr_interception(vcpu); 3262 else if (exit_code == SVM_EXIT_HLT) 3263 return kvm_emulate_halt(vcpu); 3264 
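	/* NPF is another hot exit reason; dispatching these common exits directly avoids a retpoline-slowed indirect call through svm_exit_handlers[] below. */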
else if (exit_code == SVM_EXIT_NPF) 3265 return npf_interception(vcpu); 3266 #endif 3267 return svm_exit_handlers[exit_code](vcpu); 3268 } 3269 3270 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 3271 u64 *info1, u64 *info2, 3272 u32 *intr_info, u32 *error_code) 3273 { 3274 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; 3275 3276 *reason = control->exit_code; 3277 *info1 = control->exit_info_1; 3278 *info2 = control->exit_info_2; 3279 *intr_info = control->exit_int_info; 3280 if ((*intr_info & SVM_EXITINTINFO_VALID) && 3281 (*intr_info & SVM_EXITINTINFO_VALID_ERR)) 3282 *error_code = control->exit_int_info_err; 3283 else 3284 *error_code = 0; 3285 } 3286 3287 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 3288 { 3289 struct vcpu_svm *svm = to_svm(vcpu); 3290 struct kvm_run *kvm_run = vcpu->run; 3291 u32 exit_code = svm->vmcb->control.exit_code; 3292 3293 trace_kvm_exit(vcpu, KVM_ISA_SVM); 3294 3295 /* SEV-ES guests must use the CR write traps to track CR registers. */ 3296 if (!sev_es_guest(vcpu->kvm)) { 3297 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) 3298 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3299 if (npt_enabled) 3300 vcpu->arch.cr3 = svm->vmcb->save.cr3; 3301 } 3302 3303 if (is_guest_mode(vcpu)) { 3304 int vmexit; 3305 3306 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM); 3307 3308 vmexit = nested_svm_exit_special(svm); 3309 3310 if (vmexit == NESTED_EXIT_CONTINUE) 3311 vmexit = nested_svm_exit_handled(svm); 3312 3313 if (vmexit == NESTED_EXIT_DONE) 3314 return 1; 3315 } 3316 3317 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 3318 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3319 kvm_run->fail_entry.hardware_entry_failure_reason 3320 = svm->vmcb->control.exit_code; 3321 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 3322 dump_vmcb(vcpu); 3323 return 0; 3324 } 3325 3326 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 3327 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 3328 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && 3329 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) 3330 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " 3331 "exit_code 0x%x\n", 3332 __func__, svm->vmcb->control.exit_int_info, 3333 exit_code); 3334 3335 if (exit_fastpath != EXIT_FASTPATH_NONE) 3336 return 1; 3337 3338 return svm_invoke_exit_handler(vcpu, exit_code); 3339 } 3340 3341 static void reload_tss(struct kvm_vcpu *vcpu) 3342 { 3343 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); 3344 3345 sd->tss_desc->type = 9; /* available 32/64-bit TSS */ 3346 load_TR_desc(); 3347 } 3348 3349 static void pre_svm_run(struct kvm_vcpu *vcpu) 3350 { 3351 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); 3352 struct vcpu_svm *svm = to_svm(vcpu); 3353 3354 /* 3355 * If the previous vmrun of the vmcb occurred on a different physical 3356 * cpu, then mark the vmcb dirty and assign a new asid. Hardware's 3357 * vmcb clean bits are per logical CPU, as are KVM's asid assignments. 
3358 */ 3359 if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) { 3360 svm->current_vmcb->asid_generation = 0; 3361 vmcb_mark_all_dirty(svm->vmcb); 3362 svm->current_vmcb->cpu = vcpu->cpu; 3363 } 3364 3365 if (sev_guest(vcpu->kvm)) 3366 return pre_sev_run(svm, vcpu->cpu); 3367 3368 /* FIXME: handle wraparound of asid_generation */ 3369 if (svm->current_vmcb->asid_generation != sd->asid_generation) 3370 new_asid(svm, sd); 3371 } 3372 3373 static void svm_inject_nmi(struct kvm_vcpu *vcpu) 3374 { 3375 struct vcpu_svm *svm = to_svm(vcpu); 3376 3377 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 3378 vcpu->arch.hflags |= HF_NMI_MASK; 3379 if (!sev_es_guest(vcpu->kvm)) 3380 svm_set_intercept(svm, INTERCEPT_IRET); 3381 ++vcpu->stat.nmi_injections; 3382 } 3383 3384 static void svm_inject_irq(struct kvm_vcpu *vcpu) 3385 { 3386 struct vcpu_svm *svm = to_svm(vcpu); 3387 3388 BUG_ON(!(gif_set(svm))); 3389 3390 trace_kvm_inj_virq(vcpu->arch.interrupt.nr); 3391 ++vcpu->stat.irq_injections; 3392 3393 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | 3394 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; 3395 } 3396 3397 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, 3398 int trig_mode, int vector) 3399 { 3400 /* 3401 * vcpu->arch.apicv_active must be read after vcpu->mode. 3402 * Pairs with smp_store_release in vcpu_enter_guest. 3403 */ 3404 bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); 3405 3406 if (!READ_ONCE(vcpu->arch.apicv_active)) { 3407 /* Process the interrupt via inject_pending_event */ 3408 kvm_make_request(KVM_REQ_EVENT, vcpu); 3409 kvm_vcpu_kick(vcpu); 3410 return; 3411 } 3412 3413 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); 3414 if (in_guest_mode) { 3415 /* 3416 * Signal the doorbell to tell hardware to inject the IRQ. If 3417 * the vCPU exits the guest before the doorbell chimes, hardware 3418 * will automatically process AVIC interrupts at the next VMRUN. 3419 */ 3420 avic_ring_doorbell(vcpu); 3421 } else { 3422 /* 3423 * Wake the vCPU if it was blocking. KVM will then detect the 3424 * pending IRQ when checking if the vCPU has a wake event. 3425 */ 3426 kvm_vcpu_wake_up(vcpu); 3427 } 3428 } 3429 3430 static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 3431 int trig_mode, int vector) 3432 { 3433 kvm_lapic_set_irr(vector, apic); 3434 3435 /* 3436 * Pairs with the smp_mb_*() after setting vcpu->guest_mode in 3437 * vcpu_enter_guest() to ensure the write to the vIRR is ordered before 3438 * the read of guest_mode. This guarantees that either VMRUN will see 3439 * and process the new vIRR entry, or that svm_complete_interrupt_delivery 3440 * will signal the doorbell if the CPU has already entered the guest. 3441 */ 3442 smp_mb__after_atomic(); 3443 svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector); 3444 } 3445 3446 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 3447 { 3448 struct vcpu_svm *svm = to_svm(vcpu); 3449 3450 /* 3451 * SEV-ES guests must always keep the CR intercepts cleared. CR 3452 * tracking is done using the CR write traps. 
3453 */ 3454 if (sev_es_guest(vcpu->kvm)) 3455 return; 3456 3457 if (nested_svm_virtualize_tpr(vcpu)) 3458 return; 3459 3460 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); 3461 3462 if (irr == -1) 3463 return; 3464 3465 if (tpr >= irr) 3466 svm_set_intercept(svm, INTERCEPT_CR8_WRITE); 3467 } 3468 3469 bool svm_nmi_blocked(struct kvm_vcpu *vcpu) 3470 { 3471 struct vcpu_svm *svm = to_svm(vcpu); 3472 struct vmcb *vmcb = svm->vmcb; 3473 bool ret; 3474 3475 if (!gif_set(svm)) 3476 return true; 3477 3478 if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) 3479 return false; 3480 3481 ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || 3482 (vcpu->arch.hflags & HF_NMI_MASK); 3483 3484 return ret; 3485 } 3486 3487 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 3488 { 3489 struct vcpu_svm *svm = to_svm(vcpu); 3490 if (svm->nested.nested_run_pending) 3491 return -EBUSY; 3492 3493 if (svm_nmi_blocked(vcpu)) 3494 return 0; 3495 3496 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 3497 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) 3498 return -EBUSY; 3499 return 1; 3500 } 3501 3502 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) 3503 { 3504 return !!(vcpu->arch.hflags & HF_NMI_MASK); 3505 } 3506 3507 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 3508 { 3509 struct vcpu_svm *svm = to_svm(vcpu); 3510 3511 if (masked) { 3512 vcpu->arch.hflags |= HF_NMI_MASK; 3513 if (!sev_es_guest(vcpu->kvm)) 3514 svm_set_intercept(svm, INTERCEPT_IRET); 3515 } else { 3516 vcpu->arch.hflags &= ~HF_NMI_MASK; 3517 if (!sev_es_guest(vcpu->kvm)) 3518 svm_clr_intercept(svm, INTERCEPT_IRET); 3519 } 3520 } 3521 3522 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) 3523 { 3524 struct vcpu_svm *svm = to_svm(vcpu); 3525 struct vmcb *vmcb = svm->vmcb; 3526 3527 if (!gif_set(svm)) 3528 return true; 3529 3530 if (is_guest_mode(vcpu)) { 3531 /* As long as interrupts are being delivered... */ 3532 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) 3533 ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) 3534 : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) 3535 return true; 3536 3537 /* ... vmexits aren't blocked by the interrupt shadow */ 3538 if (nested_exit_on_intr(svm)) 3539 return false; 3540 } else { 3541 if (!svm_get_if_flag(vcpu)) 3542 return true; 3543 } 3544 3545 return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK); 3546 } 3547 3548 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 3549 { 3550 struct vcpu_svm *svm = to_svm(vcpu); 3551 3552 if (svm->nested.nested_run_pending) 3553 return -EBUSY; 3554 3555 if (svm_interrupt_blocked(vcpu)) 3556 return 0; 3557 3558 /* 3559 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 3560 * e.g. if the IRQ arrived asynchronously after checking nested events. 3561 */ 3562 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) 3563 return -EBUSY; 3564 3565 return 1; 3566 } 3567 3568 static void svm_enable_irq_window(struct kvm_vcpu *vcpu) 3569 { 3570 struct vcpu_svm *svm = to_svm(vcpu); 3571 3572 /* 3573 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes 3574 * 1, because that's a separate STGI/VMRUN intercept. The next time we 3575 * get that intercept, this function will be called again though and 3576 * we'll get the vintr intercept. However, if the vGIF feature is 3577 * enabled, the STGI interception will not occur. Enable the irq 3578 * window under the assumption that the hardware will set the GIF. 
3579 */ 3580 if (vgif || gif_set(svm)) { 3581 /* 3582 * IRQ window is not needed when AVIC is enabled, 3583 * unless we have pending ExtINT since it cannot be injected 3584 * via AVIC. In such a case, KVM needs to temporarily disable AVIC 3585 * and fall back to injecting the IRQ via V_IRQ. 3586 * 3587 * If running nested, AVIC is already locally inhibited 3588 * on this vCPU, therefore there is no need to request 3589 * the VM wide AVIC inhibition. 3590 */ 3591 if (!is_guest_mode(vcpu)) 3592 kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); 3593 3594 svm_set_vintr(svm); 3595 } 3596 } 3597 3598 static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) 3599 { 3600 struct vcpu_svm *svm = to_svm(vcpu); 3601 3602 if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) 3603 return; /* IRET will cause a vm exit */ 3604 3605 if (!gif_set(svm)) { 3606 if (vgif) 3607 svm_set_intercept(svm, INTERCEPT_STGI); 3608 return; /* STGI will cause a vm exit */ 3609 } 3610 3611 /* 3612 * Something prevents NMIs from being injected. Single step over the 3613 * possible problem (IRET or exception injection or interrupt shadow). 3614 */ 3615 svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); 3616 svm->nmi_singlestep = true; 3617 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 3618 } 3619 3620 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) 3621 { 3622 struct vcpu_svm *svm = to_svm(vcpu); 3623 3624 /* 3625 * Flush only the current ASID even if the TLB flush was invoked via 3626 * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all 3627 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and 3628 * unconditionally does a TLB flush on both nested VM-Enter and nested 3629 * VM-Exit (via kvm_mmu_reset_context()). 3630 */ 3631 if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) 3632 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 3633 else 3634 svm->current_vmcb->asid_generation--; 3635 } 3636 3637 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) 3638 { 3639 struct vcpu_svm *svm = to_svm(vcpu); 3640 3641 invlpga(gva, svm->vmcb->control.asid); 3642 } 3643 3644 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) 3645 { 3646 struct vcpu_svm *svm = to_svm(vcpu); 3647 3648 if (nested_svm_virtualize_tpr(vcpu)) 3649 return; 3650 3651 if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) { 3652 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 3653 kvm_set_cr8(vcpu, cr8); 3654 } 3655 } 3656 3657 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) 3658 { 3659 struct vcpu_svm *svm = to_svm(vcpu); 3660 u64 cr8; 3661 3662 if (nested_svm_virtualize_tpr(vcpu) || 3663 kvm_vcpu_apicv_active(vcpu)) 3664 return; 3665 3666 cr8 = kvm_get_cr8(vcpu); 3667 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 3668 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 3669 } 3670 3671 static void svm_complete_interrupts(struct kvm_vcpu *vcpu) 3672 { 3673 struct vcpu_svm *svm = to_svm(vcpu); 3674 u8 vector; 3675 int type; 3676 u32 exitintinfo = svm->vmcb->control.exit_int_info; 3677 unsigned int3_injected = svm->int3_injected; 3678 3679 svm->int3_injected = 0; 3680 3681 /* 3682 * If we've made progress since setting HF_IRET_MASK, we've 3683 * executed an IRET and can allow NMI injection.
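 * For SEV-ES guests the RIP isn't available to KVM, so the IRET
 * intercept alone is treated as sufficient evidence of progress.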
3684 */ 3685 if ((vcpu->arch.hflags & HF_IRET_MASK) && 3686 (sev_es_guest(vcpu->kvm) || 3687 kvm_rip_read(vcpu) != svm->nmi_iret_rip)) { 3688 vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 3689 kvm_make_request(KVM_REQ_EVENT, vcpu); 3690 } 3691 3692 vcpu->arch.nmi_injected = false; 3693 kvm_clear_exception_queue(vcpu); 3694 kvm_clear_interrupt_queue(vcpu); 3695 3696 if (!(exitintinfo & SVM_EXITINTINFO_VALID)) 3697 return; 3698 3699 kvm_make_request(KVM_REQ_EVENT, vcpu); 3700 3701 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; 3702 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; 3703 3704 switch (type) { 3705 case SVM_EXITINTINFO_TYPE_NMI: 3706 vcpu->arch.nmi_injected = true; 3707 break; 3708 case SVM_EXITINTINFO_TYPE_EXEPT: 3709 /* 3710 * Never re-inject a #VC exception. 3711 */ 3712 if (vector == X86_TRAP_VC) 3713 break; 3714 3715 /* 3716 * In case of software exceptions, do not reinject the vector, 3717 * but re-execute the instruction instead. Rewind RIP first 3718 * if we emulated INT3 before. 3719 */ 3720 if (kvm_exception_is_soft(vector)) { 3721 if (vector == BP_VECTOR && int3_injected && 3722 kvm_is_linear_rip(vcpu, svm->int3_rip)) 3723 kvm_rip_write(vcpu, 3724 kvm_rip_read(vcpu) - int3_injected); 3725 break; 3726 } 3727 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { 3728 u32 err = svm->vmcb->control.exit_int_info_err; 3729 kvm_requeue_exception_e(vcpu, vector, err); 3730 3731 } else 3732 kvm_requeue_exception(vcpu, vector); 3733 break; 3734 case SVM_EXITINTINFO_TYPE_INTR: 3735 kvm_queue_interrupt(vcpu, vector, false); 3736 break; 3737 default: 3738 break; 3739 } 3740 } 3741 3742 static void svm_cancel_injection(struct kvm_vcpu *vcpu) 3743 { 3744 struct vcpu_svm *svm = to_svm(vcpu); 3745 struct vmcb_control_area *control = &svm->vmcb->control; 3746 3747 control->exit_int_info = control->event_inj; 3748 control->exit_int_info_err = control->event_inj_err; 3749 control->event_inj = 0; 3750 svm_complete_interrupts(vcpu); 3751 } 3752 3753 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) 3754 { 3755 return 1; 3756 } 3757 3758 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 3759 { 3760 if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && 3761 to_svm(vcpu)->vmcb->control.exit_info_1) 3762 return handle_fastpath_set_msr_irqoff(vcpu); 3763 3764 return EXIT_FASTPATH_NONE; 3765 } 3766 3767 static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) 3768 { 3769 struct vcpu_svm *svm = to_svm(vcpu); 3770 unsigned long vmcb_pa = svm->current_vmcb->pa; 3771 3772 guest_state_enter_irqoff(); 3773 3774 if (sev_es_guest(vcpu->kvm)) { 3775 __svm_sev_es_vcpu_run(vmcb_pa); 3776 } else { 3777 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); 3778 3779 /* 3780 * Use a single vmcb (vmcb01 because it's always valid) for 3781 * context switching guest state via VMLOAD/VMSAVE, that way 3782 * the state doesn't need to be copied between vmcb01 and 3783 * vmcb02 when switching vmcbs for nested virtualization. 
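 * The host's own state is restored afterwards via the vmload() of the
 * per-CPU save area below.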
3784 */ 3785 vmload(svm->vmcb01.pa); 3786 __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs); 3787 vmsave(svm->vmcb01.pa); 3788 3789 vmload(__sme_page_pa(sd->save_area)); 3790 } 3791 3792 guest_state_exit_irqoff(); 3793 } 3794 3795 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) 3796 { 3797 struct vcpu_svm *svm = to_svm(vcpu); 3798 3799 trace_kvm_entry(vcpu); 3800 3801 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 3802 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 3803 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 3804 3805 /* 3806 * Disable singlestep if we're injecting an interrupt/exception. 3807 * We don't want our modified rflags to be pushed on the stack where 3808 * we might not be able to easily reset them if we disabled NMI 3809 * singlestep later. 3810 */ 3811 if (svm->nmi_singlestep && svm->vmcb->control.event_inj) { 3812 /* 3813 * Event injection happens before external interrupts cause a 3814 * vmexit and interrupts are disabled here, so smp_send_reschedule 3815 * is enough to force an immediate vmexit. 3816 */ 3817 disable_nmi_singlestep(svm); 3818 smp_send_reschedule(vcpu->cpu); 3819 } 3820 3821 pre_svm_run(vcpu); 3822 3823 sync_lapic_to_cr8(vcpu); 3824 3825 if (unlikely(svm->asid != svm->vmcb->control.asid)) { 3826 svm->vmcb->control.asid = svm->asid; 3827 vmcb_mark_dirty(svm->vmcb, VMCB_ASID); 3828 } 3829 svm->vmcb->save.cr2 = vcpu->arch.cr2; 3830 3831 svm_hv_update_vp_id(svm->vmcb, vcpu); 3832 3833 /* 3834 * Run with all-zero DR6 unless needed, so that we can get the exact cause 3835 * of a #DB. 3836 */ 3837 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) 3838 svm_set_dr6(svm, vcpu->arch.dr6); 3839 else 3840 svm_set_dr6(svm, DR6_ACTIVE_LOW); 3841 3842 clgi(); 3843 kvm_load_guest_xsave_state(vcpu); 3844 3845 kvm_wait_lapic_expire(vcpu); 3846 3847 /* 3848 * If this vCPU has touched SPEC_CTRL, restore the guest's value if 3849 * it's non-zero. Since vmentry is serialising on affected CPUs, there 3850 * is no need to worry about the conditional branch over the wrmsr 3851 * being speculatively taken. 3852 */ 3853 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 3854 x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); 3855 3856 svm_vcpu_enter_exit(vcpu); 3857 3858 /* 3859 * We do not use IBRS in the kernel. If this vCPU has used the 3860 * SPEC_CTRL MSR it may have left it on; save the value and 3861 * turn it off. This is much more efficient than blindly adding 3862 * it to the atomic save/restore list. Especially as the former 3863 * (Saving guest MSRs on vmexit) doesn't even exist in KVM. 3864 * 3865 * For non-nested case: 3866 * If the L01 MSR bitmap does not intercept the MSR, then we need to 3867 * save it. 3868 * 3869 * For nested case: 3870 * If the L02 MSR bitmap does not intercept the MSR, then we need to 3871 * save it. 
3872 */ 3873 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) && 3874 unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) 3875 svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); 3876 3877 if (!sev_es_guest(vcpu->kvm)) 3878 reload_tss(vcpu); 3879 3880 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 3881 x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); 3882 3883 if (!sev_es_guest(vcpu->kvm)) { 3884 vcpu->arch.cr2 = svm->vmcb->save.cr2; 3885 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 3886 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 3887 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 3888 } 3889 vcpu->arch.regs_dirty = 0; 3890 3891 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 3892 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); 3893 3894 kvm_load_host_xsave_state(vcpu); 3895 stgi(); 3896 3897 /* Any pending NMI will happen here */ 3898 3899 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 3900 kvm_after_interrupt(vcpu); 3901 3902 sync_cr8_to_lapic(vcpu); 3903 3904 svm->next_rip = 0; 3905 if (is_guest_mode(vcpu)) { 3906 nested_sync_control_from_vmcb02(svm); 3907 3908 /* Track VMRUNs that have made past consistency checking */ 3909 if (svm->nested.nested_run_pending && 3910 svm->vmcb->control.exit_code != SVM_EXIT_ERR) 3911 ++vcpu->stat.nested_run; 3912 3913 svm->nested.nested_run_pending = 0; 3914 } 3915 3916 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; 3917 vmcb_mark_all_clean(svm->vmcb); 3918 3919 /* if exit due to PF check for async PF */ 3920 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) 3921 vcpu->arch.apf.host_apf_flags = 3922 kvm_read_and_reset_apf_flags(); 3923 3924 vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; 3925 3926 /* 3927 * We need to handle MC intercepts here before the vcpu has a chance to 3928 * change the physical cpu 3929 */ 3930 if (unlikely(svm->vmcb->control.exit_code == 3931 SVM_EXIT_EXCP_BASE + MC_VECTOR)) 3932 svm_handle_mce(vcpu); 3933 3934 svm_complete_interrupts(vcpu); 3935 3936 if (is_guest_mode(vcpu)) 3937 return EXIT_FASTPATH_NONE; 3938 3939 return svm_exit_handlers_fastpath(vcpu); 3940 } 3941 3942 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, 3943 int root_level) 3944 { 3945 struct vcpu_svm *svm = to_svm(vcpu); 3946 unsigned long cr3; 3947 3948 if (npt_enabled) { 3949 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa); 3950 vmcb_mark_dirty(svm->vmcb, VMCB_NPT); 3951 3952 hv_track_root_tdp(vcpu, root_hpa); 3953 3954 cr3 = vcpu->arch.cr3; 3955 } else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) { 3956 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); 3957 } else { 3958 /* PCID in the guest should be impossible with a 32-bit MMU. */ 3959 WARN_ON_ONCE(kvm_get_active_pcid(vcpu)); 3960 cr3 = root_hpa; 3961 } 3962 3963 svm->vmcb->save.cr3 = cr3; 3964 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 3965 } 3966 3967 static int is_disabled(void) 3968 { 3969 u64 vm_cr; 3970 3971 rdmsrl(MSR_VM_CR, vm_cr); 3972 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) 3973 return 1; 3974 3975 return 0; 3976 } 3977 3978 static void 3979 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 3980 { 3981 /* 3982 * Patch in the VMMCALL instruction: 3983 */ 3984 hypercall[0] = 0x0f; 3985 hypercall[1] = 0x01; 3986 hypercall[2] = 0xd9; 3987 } 3988 3989 static int __init svm_check_processor_compat(void) 3990 { 3991 return 0; 3992 } 3993 3994 /* 3995 * The kvm parameter can be NULL (module initialization, or invocation before 3996 * VM creation). 
Be sure to check the kvm parameter before using it. 3997 */ 3998 static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) 3999 { 4000 switch (index) { 4001 case MSR_IA32_MCG_EXT_CTL: 4002 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 4003 return false; 4004 case MSR_IA32_SMBASE: 4005 /* SEV-ES guests do not support SMM, so report false */ 4006 if (kvm && sev_es_guest(kvm)) 4007 return false; 4008 break; 4009 default: 4010 break; 4011 } 4012 4013 return true; 4014 } 4015 4016 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 4017 { 4018 return 0; 4019 } 4020 4021 static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) 4022 { 4023 struct vcpu_svm *svm = to_svm(vcpu); 4024 struct kvm_cpuid_entry2 *best; 4025 struct kvm *kvm = vcpu->kvm; 4026 4027 vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && 4028 boot_cpu_has(X86_FEATURE_XSAVE) && 4029 boot_cpu_has(X86_FEATURE_XSAVES); 4030 4031 /* Update nrips enabled cache */ 4032 svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && 4033 guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); 4034 4035 svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); 4036 svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV); 4037 4038 svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); 4039 4040 svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) && 4041 guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER); 4042 4043 svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) && 4044 guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD); 4045 4046 svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF); 4047 4048 svm_recalc_instruction_intercepts(vcpu, svm); 4049 4050 /* For sev guests, the memory encryption bit is not reserved in CR3. */ 4051 if (sev_guest(vcpu->kvm)) { 4052 best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); 4053 if (best) 4054 vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); 4055 } 4056 4057 if (kvm_vcpu_apicv_active(vcpu)) { 4058 /* 4059 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature 4060 * is exposed to the guest, disable AVIC. 
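 *
 * (The inhibit set below is VM-wide: exposing x2APIC to any vCPU
 * deactivates AVIC for the whole VM.)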
4061 */ 4062 if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) 4063 kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC); 4064 } 4065 init_vmcb_after_set_cpuid(vcpu); 4066 } 4067 4068 static bool svm_has_wbinvd_exit(void) 4069 { 4070 return true; 4071 } 4072 4073 #define PRE_EX(exit) { .exit_code = (exit), \ 4074 .stage = X86_ICPT_PRE_EXCEPT, } 4075 #define POST_EX(exit) { .exit_code = (exit), \ 4076 .stage = X86_ICPT_POST_EXCEPT, } 4077 #define POST_MEM(exit) { .exit_code = (exit), \ 4078 .stage = X86_ICPT_POST_MEMACCESS, } 4079 4080 static const struct __x86_intercept { 4081 u32 exit_code; 4082 enum x86_intercept_stage stage; 4083 } x86_intercept_map[] = { 4084 [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0), 4085 [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0), 4086 [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0), 4087 [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0), 4088 [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0), 4089 [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0), 4090 [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0), 4091 [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ), 4092 [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ), 4093 [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE), 4094 [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE), 4095 [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ), 4096 [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ), 4097 [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE), 4098 [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE), 4099 [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN), 4100 [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL), 4101 [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD), 4102 [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE), 4103 [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI), 4104 [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI), 4105 [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT), 4106 [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA), 4107 [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP), 4108 [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR), 4109 [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT), 4110 [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG), 4111 [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD), 4112 [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD), 4113 [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR), 4114 [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC), 4115 [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR), 4116 [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC), 4117 [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID), 4118 [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM), 4119 [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE), 4120 [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF), 4121 [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF), 4122 [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT), 4123 [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET), 4124 [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP), 4125 [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT), 4126 [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO), 4127 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), 4128 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), 4129 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), 4130 [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV), 4131 }; 4132 4133 #undef PRE_EX 4134 #undef POST_EX 4135 #undef POST_MEM 4136 4137 static int svm_check_intercept(struct kvm_vcpu *vcpu, 4138 struct x86_instruction_info *info, 4139 enum x86_intercept_stage stage, 4140 struct x86_exception *exception) 4141 { 4142 struct vcpu_svm *svm = to_svm(vcpu); 4143 int 
vmexit, ret = X86EMUL_CONTINUE; 4144 struct __x86_intercept icpt_info; 4145 struct vmcb *vmcb = svm->vmcb; 4146 4147 if (info->intercept >= ARRAY_SIZE(x86_intercept_map)) 4148 goto out; 4149 4150 icpt_info = x86_intercept_map[info->intercept]; 4151 4152 if (stage != icpt_info.stage) 4153 goto out; 4154 4155 switch (icpt_info.exit_code) { 4156 case SVM_EXIT_READ_CR0: 4157 if (info->intercept == x86_intercept_cr_read) 4158 icpt_info.exit_code += info->modrm_reg; 4159 break; 4160 case SVM_EXIT_WRITE_CR0: { 4161 unsigned long cr0, val; 4162 4163 if (info->intercept == x86_intercept_cr_write) 4164 icpt_info.exit_code += info->modrm_reg; 4165 4166 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 || 4167 info->intercept == x86_intercept_clts) 4168 break; 4169 4170 if (!(vmcb12_is_intercept(&svm->nested.ctl, 4171 INTERCEPT_SELECTIVE_CR0))) 4172 break; 4173 4174 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; 4175 val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; 4176 4177 if (info->intercept == x86_intercept_lmsw) { 4178 cr0 &= 0xfUL; 4179 val &= 0xfUL; 4180 /* lmsw can't clear PE - catch this here */ 4181 if (cr0 & X86_CR0_PE) 4182 val |= X86_CR0_PE; 4183 } 4184 4185 if (cr0 ^ val) 4186 icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; 4187 4188 break; 4189 } 4190 case SVM_EXIT_READ_DR0: 4191 case SVM_EXIT_WRITE_DR0: 4192 icpt_info.exit_code += info->modrm_reg; 4193 break; 4194 case SVM_EXIT_MSR: 4195 if (info->intercept == x86_intercept_wrmsr) 4196 vmcb->control.exit_info_1 = 1; 4197 else 4198 vmcb->control.exit_info_1 = 0; 4199 break; 4200 case SVM_EXIT_PAUSE: 4201 /* 4202 * We get this for NOP only, but pause 4203 * is rep not, check this here 4204 */ 4205 if (info->rep_prefix != REPE_PREFIX) 4206 goto out; 4207 break; 4208 case SVM_EXIT_IOIO: { 4209 u64 exit_info; 4210 u32 bytes; 4211 4212 if (info->intercept == x86_intercept_in || 4213 info->intercept == x86_intercept_ins) { 4214 exit_info = ((info->src_val & 0xffff) << 16) | 4215 SVM_IOIO_TYPE_MASK; 4216 bytes = info->dst_bytes; 4217 } else { 4218 exit_info = (info->dst_val & 0xffff) << 16; 4219 bytes = info->src_bytes; 4220 } 4221 4222 if (info->intercept == x86_intercept_outs || 4223 info->intercept == x86_intercept_ins) 4224 exit_info |= SVM_IOIO_STR_MASK; 4225 4226 if (info->rep_prefix) 4227 exit_info |= SVM_IOIO_REP_MASK; 4228 4229 bytes = min(bytes, 4u); 4230 4231 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT; 4232 4233 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1); 4234 4235 vmcb->control.exit_info_1 = exit_info; 4236 vmcb->control.exit_info_2 = info->next_rip; 4237 4238 break; 4239 } 4240 default: 4241 break; 4242 } 4243 4244 /* TODO: Advertise NRIPS to guest hypervisor unconditionally */ 4245 if (static_cpu_has(X86_FEATURE_NRIPS)) 4246 vmcb->control.next_rip = info->next_rip; 4247 vmcb->control.exit_code = icpt_info.exit_code; 4248 vmexit = nested_svm_exit_handled(svm); 4249 4250 ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED 4251 : X86EMUL_CONTINUE; 4252 4253 out: 4254 return ret; 4255 } 4256 4257 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) 4258 { 4259 if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR) 4260 vcpu->arch.at_instruction_boundary = true; 4261 } 4262 4263 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) 4264 { 4265 if (!kvm_pause_in_guest(vcpu->kvm)) 4266 shrink_ple_window(vcpu); 4267 } 4268 4269 static void svm_setup_mce(struct kvm_vcpu *vcpu) 4270 { 4271 /* [63:9] are reserved. 
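 * Only MCG_CAP[8:0], i.e. the bank count and MCG_CTL_P, is kept below.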
*/
4272 vcpu->arch.mcg_cap &= 0x1ff;
4273 }
4274
4275 bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4276 {
4277 struct vcpu_svm *svm = to_svm(vcpu);
4278
4279 /* Per APM Vol.2 15.22.2 "Response to SMI" */
4280 if (!gif_set(svm))
4281 return true;
4282
4283 return is_smm(vcpu);
4284 }
4285
4286 static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4287 {
4288 struct vcpu_svm *svm = to_svm(vcpu);
4289 if (svm->nested.nested_run_pending)
4290 return -EBUSY;
4291
4292 if (svm_smi_blocked(vcpu))
4293 return 0;
4294
4295 /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
4296 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4297 return -EBUSY;
4298
4299 return 1;
4300 }
4301
4302 static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
4303 {
4304 struct vcpu_svm *svm = to_svm(vcpu);
4305 struct kvm_host_map map_save;
4306 int ret;
4307
4308 if (!is_guest_mode(vcpu))
4309 return 0;
4310
4311 /* FED8h - SVM Guest */
4312 put_smstate(u64, smstate, 0x7ed8, 1);
4313 /* FEE0h - SVM Guest VMCB Physical Address */
4314 put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
4315
4316 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4317 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4318 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4319
4320 ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
4321 if (ret)
4322 return ret;
4323
4324 /*
4325 * KVM uses VMCB01 to store L1 host state while L2 runs, but
4326 * VMCB01 is going to be used during SMM and thus the state will
4327 * be lost. Temporarily save the non-VMLOAD/VMSAVE state to the host
4328 * save area pointed to by MSR_VM_HSAVE_PA. The APM guarantees that the
4329 * format of the area is identical to the guest save area, offset
4330 * by 0x400 (matches the offset of 'struct vmcb_save_area'
4331 * within 'struct vmcb'). Note: the HSAVE area may also be used by
4332 * the L1 hypervisor to save additional host context (e.g. KVM does
4333 * that, see svm_prepare_switch_to_guest()), which must be
4334 * preserved.
4335 */
4336 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
4337 &map_save) == -EINVAL)
4338 return 1;
4339
4340 BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
4341
4342 svm_copy_vmrun_state(map_save.hva + 0x400,
4343 &svm->vmcb01.ptr->save);
4344
4345 kvm_vcpu_unmap(vcpu, &map_save, true);
4346 return 0;
4347 }
4348
4349 static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
4350 {
4351 struct vcpu_svm *svm = to_svm(vcpu);
4352 struct kvm_host_map map, map_save;
4353 u64 saved_efer, vmcb12_gpa;
4354 struct vmcb *vmcb12;
4355 int ret;
4356
4357 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4358 return 0;
4359
4360 /* Non-zero if SMI arrived while vCPU was in guest mode.
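 * This mirrors the value svm_enter_smm() wrote at offset FED8h above.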
*/
4361 if (!GET_SMSTATE(u64, smstate, 0x7ed8))
4362 return 0;
4363
4364 if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4365 return 1;
4366
4367 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
4368 if (!(saved_efer & EFER_SVME))
4369 return 1;
4370
4371 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
4372 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
4373 return 1;
4374
4375 ret = 1;
4376 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
4377 goto unmap_map;
4378
4379 if (svm_allocate_nested(svm))
4380 goto unmap_save;
4381
4382 /*
4383 * Restore L1 host state from the L1 HSAVE area, as VMCB01 was
4384 * used during SMM (see svm_enter_smm()).
4385 */
4386
4387 svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4388
4389 /*
4390 * Enter the nested guest now.
4391 */
4392
4393 vmcb_mark_all_dirty(svm->vmcb01.ptr);
4394
4395 vmcb12 = map.hva;
4396 nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
4397 nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
4398 ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
4399
4400 if (ret)
4401 goto unmap_save;
4402
4403 svm->nested.nested_run_pending = 1;
4404
4405 unmap_save:
4406 kvm_vcpu_unmap(vcpu, &map_save, true);
4407 unmap_map:
4408 kvm_vcpu_unmap(vcpu, &map, true);
4409 return ret;
4410 }
4411
4412 static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4413 {
4414 struct vcpu_svm *svm = to_svm(vcpu);
4415
4416 if (!gif_set(svm)) {
4417 if (vgif)
4418 svm_set_intercept(svm, INTERCEPT_STGI);
4419 /* STGI will cause a vm exit */
4420 } else {
4421 /* We must be in SMM; RSM will cause a vmexit anyway. */
4422 }
4423 }
4424
4425 static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4426 void *insn, int insn_len)
4427 {
4428 bool smep, smap, is_user;
4429 unsigned long cr4;
4430 u64 error_code;
4431
4432 /* Emulation is always possible when KVM has access to all guest state. */
4433 if (!sev_guest(vcpu->kvm))
4434 return true;
4435
4436 /* #UD and #GP should never be intercepted for SEV guests. */
4437 WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4438 EMULTYPE_TRAP_UD_FORCED |
4439 EMULTYPE_VMWARE_GP));
4440
4441 /*
4442 * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4443 * to guest register state.
4444 */
4445 if (sev_es_guest(vcpu->kvm))
4446 return false;
4447
4448 /*
4449 * Emulation is possible if the instruction is already decoded, e.g.
4450 * when completing I/O after returning from userspace.
4451 */
4452 if (emul_type & EMULTYPE_NO_DECODE)
4453 return true;
4454
4455 /*
4456 * Emulation is possible for SEV guests if and only if a prefilled
4457 * buffer containing the bytes of the intercepted instruction is
4458 * available. SEV guest memory is encrypted with a guest-specific key
4459 * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
4460 * decode garbage.
4461 *
4462 * Inject #UD if KVM reached this point without an instruction buffer.
4463 * In practice, this path should never be hit by a well-behaved guest,
4464 * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
4465 * is still theoretically reachable, e.g. via unaccelerated fault-like
4466 * AVIC access, and needs to be handled by KVM to avoid putting the
4467 * guest into an infinite loop. Injecting #UD is somewhat arbitrary,
4468 * but it's the least awful option given the lack of insight into the guest.
4469 */
4470 if (unlikely(!insn)) {
4471 kvm_queue_exception(vcpu, UD_VECTOR);
4472 return false;
4473 }
4474
4475 /*
4476 * Emulate for SEV guests if the insn buffer is not empty. The buffer
4477 * will be empty if the DecodeAssist microcode cannot fetch bytes for
4478 * the faulting instruction because the code fetch itself faulted, e.g.
4479 * the guest attempted to fetch from emulated MMIO or a guest page
4480 * table used to translate CS:RIP resides in emulated MMIO.
4481 */
4482 if (likely(insn_len))
4483 return true;
4484
4485 /*
4486 * Detect and work around Erratum 1096 Fam_17h_00_0Fh.
4487 *
4488 * Erratum:
4489 * When the CPU raises #NPF on a guest data access and vCPU CR4.SMAP=1, it is
4490 * possible that CPU microcode implementing DecodeAssist will fail to
4491 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4492 * be '0'. This happens because microcode reads CS:RIP using a _data_
4493 * load uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode
4494 * gives up and does not fill the instruction bytes buffer.
4495 *
4496 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4497 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4498 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4499 * GuestIntrBytes field of the VMCB.
4500 *
4501 * This does _not_ mean that the erratum has been encountered, as the
4502 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4503 * #PF, e.g. if the guest attempted to execute from emulated MMIO and
4504 * encountered a reserved/not-present #PF.
4505 *
4506 * To hit the erratum, the following conditions must be true:
4507 * 1. CR4.SMAP=1 (obviously).
4508 * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
4509 * have been hit as the guest would have encountered a SMEP
4510 * violation #PF, not a #NPF.
4511 * 3. The #NPF is not due to a code fetch, in which case failure to
4512 * retrieve the instruction bytes is legitimate (see above).
4513 *
4514 * In addition, don't apply the erratum workaround if the #NPF occurred
4515 * while translating guest page tables (see below).
4516 */
4517 error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4518 if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4519 goto resume_guest;
4520
4521 cr4 = kvm_read_cr4(vcpu);
4522 smep = cr4 & X86_CR4_SMEP;
4523 smap = cr4 & X86_CR4_SMAP;
4524 is_user = svm_get_cpl(vcpu) == 3;
4525 if (smap && (!smep || is_user)) {
4526 pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
4527
4528 /*
4529 * If the fault occurred in userspace, arbitrarily inject #GP
4530 * to avoid killing the guest and to hopefully avoid confusing
4531 * the guest kernel too much, e.g. injecting #PF would not be
4532 * coherent with respect to the guest's page tables. Request a
4533 * triple fault if the fault occurred in the kernel as there's
4534 * no fault that KVM can inject without confusing the guest.
4535 * In practice, the triple fault is moot as no sane SEV kernel
4536 * will execute from user memory while also running with SMAP=1.
4537 */
4538 if (is_user)
4539 kvm_inject_gp(vcpu, 0);
4540 else
4541 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4542 }
4543
4544 resume_guest:
4545 /*
4546 * If the erratum was not hit, simply resume the guest and let it fault
4547 * again. While awful, e.g. the vCPU may get stuck in an infinite loop
4548 * if the fault is at CPL=0, it's the lesser of all evils.
Exiting to 4549 * userspace will kill the guest, and letting the emulator read garbage 4550 * will yield random behavior and potentially corrupt the guest. 4551 * 4552 * Simply resuming the guest is technically not a violation of the SEV 4553 * architecture. AMD's APM states that all code fetches and page table 4554 * accesses for SEV guest are encrypted, regardless of the C-Bit. The 4555 * APM also states that encrypted accesses to MMIO are "ignored", but 4556 * doesn't explicitly define "ignored", i.e. doing nothing and letting 4557 * the guest spin is technically "ignoring" the access. 4558 */ 4559 return false; 4560 } 4561 4562 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 4563 { 4564 struct vcpu_svm *svm = to_svm(vcpu); 4565 4566 /* 4567 * TODO: Last condition latch INIT signals on vCPU when 4568 * vCPU is in guest-mode and vmcb12 defines intercept on INIT. 4569 * To properly emulate the INIT intercept, 4570 * svm_check_nested_events() should call nested_svm_vmexit() 4571 * if an INIT signal is pending. 4572 */ 4573 return !gif_set(svm) || 4574 (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT)); 4575 } 4576 4577 static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) 4578 { 4579 if (!sev_es_guest(vcpu->kvm)) 4580 return kvm_vcpu_deliver_sipi_vector(vcpu, vector); 4581 4582 sev_vcpu_deliver_sipi_vector(vcpu, vector); 4583 } 4584 4585 static void svm_vm_destroy(struct kvm *kvm) 4586 { 4587 avic_vm_destroy(kvm); 4588 sev_vm_destroy(kvm); 4589 } 4590 4591 static int svm_vm_init(struct kvm *kvm) 4592 { 4593 if (!pause_filter_count || !pause_filter_thresh) 4594 kvm->arch.pause_in_guest = true; 4595 4596 if (enable_apicv) { 4597 int ret = avic_vm_init(kvm); 4598 if (ret) 4599 return ret; 4600 } 4601 4602 return 0; 4603 } 4604 4605 static struct kvm_x86_ops svm_x86_ops __initdata = { 4606 .name = "kvm_amd", 4607 4608 .hardware_unsetup = svm_hardware_unsetup, 4609 .hardware_enable = svm_hardware_enable, 4610 .hardware_disable = svm_hardware_disable, 4611 .has_emulated_msr = svm_has_emulated_msr, 4612 4613 .vcpu_create = svm_vcpu_create, 4614 .vcpu_free = svm_vcpu_free, 4615 .vcpu_reset = svm_vcpu_reset, 4616 4617 .vm_size = sizeof(struct kvm_svm), 4618 .vm_init = svm_vm_init, 4619 .vm_destroy = svm_vm_destroy, 4620 4621 .prepare_switch_to_guest = svm_prepare_switch_to_guest, 4622 .vcpu_load = svm_vcpu_load, 4623 .vcpu_put = svm_vcpu_put, 4624 .vcpu_blocking = avic_vcpu_blocking, 4625 .vcpu_unblocking = avic_vcpu_unblocking, 4626 4627 .update_exception_bitmap = svm_update_exception_bitmap, 4628 .get_msr_feature = svm_get_msr_feature, 4629 .get_msr = svm_get_msr, 4630 .set_msr = svm_set_msr, 4631 .get_segment_base = svm_get_segment_base, 4632 .get_segment = svm_get_segment, 4633 .set_segment = svm_set_segment, 4634 .get_cpl = svm_get_cpl, 4635 .get_cs_db_l_bits = svm_get_cs_db_l_bits, 4636 .set_cr0 = svm_set_cr0, 4637 .post_set_cr3 = sev_post_set_cr3, 4638 .is_valid_cr4 = svm_is_valid_cr4, 4639 .set_cr4 = svm_set_cr4, 4640 .set_efer = svm_set_efer, 4641 .get_idt = svm_get_idt, 4642 .set_idt = svm_set_idt, 4643 .get_gdt = svm_get_gdt, 4644 .set_gdt = svm_set_gdt, 4645 .set_dr7 = svm_set_dr7, 4646 .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, 4647 .cache_reg = svm_cache_reg, 4648 .get_rflags = svm_get_rflags, 4649 .set_rflags = svm_set_rflags, 4650 .get_if_flag = svm_get_if_flag, 4651 4652 .flush_tlb_all = svm_flush_tlb_current, 4653 .flush_tlb_current = svm_flush_tlb_current, 4654 .flush_tlb_gva = svm_flush_tlb_gva, 4655 .flush_tlb_guest = 
svm_flush_tlb_current, 4656 4657 .vcpu_pre_run = svm_vcpu_pre_run, 4658 .vcpu_run = svm_vcpu_run, 4659 .handle_exit = svm_handle_exit, 4660 .skip_emulated_instruction = svm_skip_emulated_instruction, 4661 .update_emulated_instruction = NULL, 4662 .set_interrupt_shadow = svm_set_interrupt_shadow, 4663 .get_interrupt_shadow = svm_get_interrupt_shadow, 4664 .patch_hypercall = svm_patch_hypercall, 4665 .inject_irq = svm_inject_irq, 4666 .inject_nmi = svm_inject_nmi, 4667 .queue_exception = svm_queue_exception, 4668 .cancel_injection = svm_cancel_injection, 4669 .interrupt_allowed = svm_interrupt_allowed, 4670 .nmi_allowed = svm_nmi_allowed, 4671 .get_nmi_mask = svm_get_nmi_mask, 4672 .set_nmi_mask = svm_set_nmi_mask, 4673 .enable_nmi_window = svm_enable_nmi_window, 4674 .enable_irq_window = svm_enable_irq_window, 4675 .update_cr8_intercept = svm_update_cr8_intercept, 4676 .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, 4677 .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, 4678 .apicv_post_state_restore = avic_apicv_post_state_restore, 4679 4680 .get_mt_mask = svm_get_mt_mask, 4681 .get_exit_info = svm_get_exit_info, 4682 4683 .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, 4684 4685 .has_wbinvd_exit = svm_has_wbinvd_exit, 4686 4687 .get_l2_tsc_offset = svm_get_l2_tsc_offset, 4688 .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, 4689 .write_tsc_offset = svm_write_tsc_offset, 4690 .write_tsc_multiplier = svm_write_tsc_multiplier, 4691 4692 .load_mmu_pgd = svm_load_mmu_pgd, 4693 4694 .check_intercept = svm_check_intercept, 4695 .handle_exit_irqoff = svm_handle_exit_irqoff, 4696 4697 .request_immediate_exit = __kvm_request_immediate_exit, 4698 4699 .sched_in = svm_sched_in, 4700 4701 .nested_ops = &svm_nested_ops, 4702 4703 .deliver_interrupt = svm_deliver_interrupt, 4704 .pi_update_irte = avic_pi_update_irte, 4705 .setup_mce = svm_setup_mce, 4706 4707 .smi_allowed = svm_smi_allowed, 4708 .enter_smm = svm_enter_smm, 4709 .leave_smm = svm_leave_smm, 4710 .enable_smi_window = svm_enable_smi_window, 4711 4712 .mem_enc_ioctl = sev_mem_enc_ioctl, 4713 .mem_enc_register_region = sev_mem_enc_register_region, 4714 .mem_enc_unregister_region = sev_mem_enc_unregister_region, 4715 .guest_memory_reclaimed = sev_guest_memory_reclaimed, 4716 4717 .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, 4718 .vm_move_enc_context_from = sev_vm_move_enc_context_from, 4719 4720 .can_emulate_instruction = svm_can_emulate_instruction, 4721 4722 .apic_init_signal_blocked = svm_apic_init_signal_blocked, 4723 4724 .msr_filter_changed = svm_msr_filter_changed, 4725 .complete_emulated_msr = svm_complete_emulated_msr, 4726 4727 .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, 4728 .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, 4729 }; 4730 4731 /* 4732 * The default MMIO mask is a single bit (excluding the present bit), 4733 * which could conflict with the memory encryption bit. Check for 4734 * memory encryption support and override the default MMIO mask if 4735 * memory encryption is enabled. 
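 * Otherwise a single-bit mask could land on the C-bit and an MMIO SPTE
 * could be mistaken for an ordinary encrypted mapping.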
4736 */ 4737 static __init void svm_adjust_mmio_mask(void) 4738 { 4739 unsigned int enc_bit, mask_bit; 4740 u64 msr, mask; 4741 4742 /* If there is no memory encryption support, use existing mask */ 4743 if (cpuid_eax(0x80000000) < 0x8000001f) 4744 return; 4745 4746 /* If memory encryption is not enabled, use existing mask */ 4747 rdmsrl(MSR_AMD64_SYSCFG, msr); 4748 if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) 4749 return; 4750 4751 enc_bit = cpuid_ebx(0x8000001f) & 0x3f; 4752 mask_bit = boot_cpu_data.x86_phys_bits; 4753 4754 /* Increment the mask bit if it is the same as the encryption bit */ 4755 if (enc_bit == mask_bit) 4756 mask_bit++; 4757 4758 /* 4759 * If the mask bit location is below 52, then some bits above the 4760 * physical addressing limit will always be reserved, so use the 4761 * rsvd_bits() function to generate the mask. This mask, along with 4762 * the present bit, will be used to generate a page fault with 4763 * PFER.RSV = 1. 4764 * 4765 * If the mask bit location is 52 (or above), then clear the mask. 4766 */ 4767 mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; 4768 4769 kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); 4770 } 4771 4772 static __init void svm_set_cpu_caps(void) 4773 { 4774 kvm_set_cpu_caps(); 4775 4776 supported_xss = 0; 4777 4778 /* CPUID 0x80000001 and 0x8000000A (SVM features) */ 4779 if (nested) { 4780 kvm_cpu_cap_set(X86_FEATURE_SVM); 4781 kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN); 4782 4783 if (nrips) 4784 kvm_cpu_cap_set(X86_FEATURE_NRIPS); 4785 4786 if (npt_enabled) 4787 kvm_cpu_cap_set(X86_FEATURE_NPT); 4788 4789 if (tsc_scaling) 4790 kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); 4791 4792 if (vls) 4793 kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD); 4794 if (lbrv) 4795 kvm_cpu_cap_set(X86_FEATURE_LBRV); 4796 4797 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) 4798 kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER); 4799 4800 if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) 4801 kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD); 4802 4803 if (vgif) 4804 kvm_cpu_cap_set(X86_FEATURE_VGIF); 4805 4806 /* Nested VM can receive #VMEXIT instead of triggering #GP */ 4807 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); 4808 } 4809 4810 /* CPUID 0x80000008 */ 4811 if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || 4812 boot_cpu_has(X86_FEATURE_AMD_SSBD)) 4813 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); 4814 4815 /* AMD PMU PERFCTR_CORE CPUID */ 4816 if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) 4817 kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); 4818 4819 /* CPUID 0x8000001F (SME/SEV features) */ 4820 sev_set_cpu_caps(); 4821 } 4822 4823 static __init int svm_hardware_setup(void) 4824 { 4825 int cpu; 4826 struct page *iopm_pages; 4827 void *iopm_va; 4828 int r; 4829 unsigned int order = get_order(IOPM_SIZE); 4830 4831 /* 4832 * NX is required for shadow paging and for NPT if the NX huge pages 4833 * mitigation is enabled. 
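 * (The mitigation marks huge pages non-executable, which is only
 * possible when EFER.NX is available.)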
4834 */ 4835 if (!boot_cpu_has(X86_FEATURE_NX)) { 4836 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 4837 return -EOPNOTSUPP; 4838 } 4839 kvm_enable_efer_bits(EFER_NX); 4840 4841 iopm_pages = alloc_pages(GFP_KERNEL, order); 4842 4843 if (!iopm_pages) 4844 return -ENOMEM; 4845 4846 iopm_va = page_address(iopm_pages); 4847 memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); 4848 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 4849 4850 init_msrpm_offsets(); 4851 4852 supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); 4853 4854 if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) 4855 kvm_enable_efer_bits(EFER_FFXSR); 4856 4857 if (tsc_scaling) { 4858 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { 4859 tsc_scaling = false; 4860 } else { 4861 pr_info("TSC scaling supported\n"); 4862 kvm_has_tsc_control = true; 4863 } 4864 } 4865 kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; 4866 kvm_tsc_scaling_ratio_frac_bits = 32; 4867 4868 tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); 4869 4870 /* Check for pause filtering support */ 4871 if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { 4872 pause_filter_count = 0; 4873 pause_filter_thresh = 0; 4874 } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { 4875 pause_filter_thresh = 0; 4876 } 4877 4878 if (nested) { 4879 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 4880 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); 4881 } 4882 4883 /* 4884 * KVM's MMU doesn't support using 2-level paging for itself, and thus 4885 * NPT isn't supported if the host is using 2-level paging since host 4886 * CR4 is unchanged on VMRUN. 4887 */ 4888 if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) 4889 npt_enabled = false; 4890 4891 if (!boot_cpu_has(X86_FEATURE_NPT)) 4892 npt_enabled = false; 4893 4894 /* Force VM NPT level equal to the host's paging level */ 4895 kvm_configure_mmu(npt_enabled, get_npt_level(), 4896 get_npt_level(), PG_LEVEL_1G); 4897 pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); 4898 4899 /* Setup shadow_me_value and shadow_me_mask */ 4900 kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); 4901 4902 /* Note, SEV setup consumes npt_enabled. 
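 * SEV itself requires NPT, so this must run only after npt_enabled has
 * reached its final value.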
*/ 4903 sev_hardware_setup(); 4904 4905 svm_hv_hardware_setup(); 4906 4907 svm_adjust_mmio_mask(); 4908 4909 for_each_possible_cpu(cpu) { 4910 r = svm_cpu_init(cpu); 4911 if (r) 4912 goto err; 4913 } 4914 4915 if (nrips) { 4916 if (!boot_cpu_has(X86_FEATURE_NRIPS)) 4917 nrips = false; 4918 } 4919 4920 enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic); 4921 4922 if (enable_apicv) { 4923 if (!boot_cpu_has(X86_FEATURE_AVIC)) { 4924 pr_warn("AVIC is not supported in CPUID but force enabled"); 4925 pr_warn("Your system might crash and burn"); 4926 } else 4927 pr_info("AVIC enabled\n"); 4928 4929 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); 4930 } else { 4931 svm_x86_ops.vcpu_blocking = NULL; 4932 svm_x86_ops.vcpu_unblocking = NULL; 4933 svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; 4934 } 4935 4936 if (vls) { 4937 if (!npt_enabled || 4938 !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || 4939 !IS_ENABLED(CONFIG_X86_64)) { 4940 vls = false; 4941 } else { 4942 pr_info("Virtual VMLOAD VMSAVE supported\n"); 4943 } 4944 } 4945 4946 if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) 4947 svm_gp_erratum_intercept = false; 4948 4949 if (vgif) { 4950 if (!boot_cpu_has(X86_FEATURE_VGIF)) 4951 vgif = false; 4952 else 4953 pr_info("Virtual GIF supported\n"); 4954 } 4955 4956 if (lbrv) { 4957 if (!boot_cpu_has(X86_FEATURE_LBRV)) 4958 lbrv = false; 4959 else 4960 pr_info("LBR virtualization supported\n"); 4961 } 4962 4963 if (!enable_pmu) 4964 pr_info("PMU virtualization is disabled\n"); 4965 4966 svm_set_cpu_caps(); 4967 4968 /* 4969 * It seems that on AMD processors PTE's accessed bit is 4970 * being set by the CPU hardware before the NPF vmexit. 4971 * This is not expected behaviour and our tests fail because 4972 * of it. 4973 * A workaround here is to disable support for 4974 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. 4975 * In this case userspace can know if there is support using 4976 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle 4977 * it 4978 * If future AMD CPU models change the behaviour described above, 4979 * this variable can be changed accordingly 4980 */ 4981 allow_smaller_maxphyaddr = !npt_enabled; 4982 4983 return 0; 4984 4985 err: 4986 svm_hardware_unsetup(); 4987 return r; 4988 } 4989 4990 4991 static struct kvm_x86_init_ops svm_init_ops __initdata = { 4992 .cpu_has_kvm_support = has_svm, 4993 .disabled_by_bios = is_disabled, 4994 .hardware_setup = svm_hardware_setup, 4995 .check_processor_compatibility = svm_check_processor_compat, 4996 4997 .runtime_ops = &svm_x86_ops, 4998 .pmu_ops = &amd_pmu_ops, 4999 }; 5000 5001 static int __init svm_init(void) 5002 { 5003 __unused_size_checks(); 5004 5005 return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), 5006 __alignof__(struct vcpu_svm), THIS_MODULE); 5007 } 5008 5009 static void __exit svm_exit(void) 5010 { 5011 kvm_exit(); 5012 } 5013 5014 module_init(svm_init) 5015 module_exit(svm_exit) 5016