#define pr_fmt(fmt) "SVM: " fmt

#include <linux/kvm_host.h>

#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"

#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/amd-iommu.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
#include <linux/objtool.h>
#include <linux/psp-sev.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/rwsem.h>
#include <linux/cc_platform.h>

#include <asm/apic.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/fpu/api.h>

#include <asm/virtext.h>
#include "trace.h"

#include "svm.h"
#include "svm_ops.h"

#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id svm_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif

#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))

static bool erratum_383_found __read_mostly;

u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;

/*
 * Set osvw_len to higher value when updated Revision Guides
 * are published and we know what the new status bits are
 */
static uint64_t osvw_len = 4, osvw_status;

static DEFINE_PER_CPU(u64, current_tsc_ratio);

static const struct svm_direct_access_msrs {
	u32 index;	/* Index of the MSR */
	bool always;	/* True if intercept is initially cleared */
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
	{ .index = MSR_STAR,				.always = true  },
	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
	{ .index = MSR_IA32_SYSENTER_EIP,		.always = false },
	{ .index = MSR_IA32_SYSENTER_ESP,		.always = false },
#ifdef CONFIG_X86_64
	{ .index = MSR_GS_BASE,				.always = true  },
	{ .index = MSR_FS_BASE,				.always = true  },
	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
	{ .index = MSR_LSTAR,				.always = true  },
	{ .index = MSR_CSTAR,				.always = true  },
	{ .index = MSR_SYSCALL_MASK,			.always = true  },
#endif
	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
	{ .index = MSR_IA32_PRED_CMD,			.always = false },
	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
	{ .index = MSR_EFER,				.always = false },
	{ .index = MSR_IA32_CR_PAT,			.always = false },
	{ .index = MSR_AMD64_SEV_ES_GHCB,		.always = true  },
	{ .index = MSR_INVALID,				.always = false },
};

/*
 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 * pause_filter_count: On processors that support Pause filtering (indicated
 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 *	count value. On VMRUN this value is loaded into an internal counter.
 *	Each time a pause instruction is executed, this counter is decremented
 *	until it reaches zero, at which time a #VMEXIT is generated if pause
 *	intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
 *	Intercept Filtering for more details.
 *	This also indicates whether PLE logic is enabled.
 *
 * pause_filter_thresh: In addition, some processor families support advanced
 *	pause filtering (indicated by CPUID Fn8000_000A_EDX) with an upper
 *	bound on the amount of time a guest is allowed to execute in a pause
 *	loop. In this mode, a 16-bit pause filter threshold field is added in
 *	the VMCB. The threshold value is a cycle count that is used to reset
 *	the pause counter. As with simple pause filtering, VMRUN loads the
 *	pause count value from VMCB into an internal counter. Then, on each
 *	pause instruction the hardware checks the elapsed number of cycles
 *	since the most recent pause instruction against the pause filter
 *	threshold. If the elapsed cycle count is greater than the pause filter
 *	threshold, then the internal pause count is reloaded from the VMCB and
 *	execution continues. If the elapsed cycle count is less than the pause
 *	filter threshold, then the internal pause count is decremented. If the
 *	count value is less than zero and PAUSE intercept is enabled, a
 *	#VMEXIT is triggered. If advanced pause filtering is supported and
 *	pause filter threshold field is set to zero, the filter will operate
 *	in the simpler, count only mode.
 */

static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);

/*
 * Use nested page tables by default. Note, NPT may get forced off by
 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 */
bool npt_enabled = true;
module_param_named(npt, npt_enabled, bool, 0444);

/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, S_IRUGO);

/* enable/disable Next RIP Save */
static int nrips = true;
module_param(nrips, int, 0444);

/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);

/* enable/disable Virtual GIF */
static int vgif = true;
module_param(vgif, int, 0444);

/* enable/disable LBR virtualization */
static int lbrv = true;
module_param(lbrv, int, 0444);

static int tsc_scaling = true;
module_param(tsc_scaling, int, 0444);

/*
 * enable / disable AVIC.  Because the defaults differ for APICv
 * support between VMX and SVM we cannot use module_param_named.
 */
static bool avic;
module_param(avic, bool, 0444);

bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);

bool intercept_smi = true;
module_param(intercept_smi, bool, 0444);

static bool svm_gp_erratum_intercept = true;

static u8 rsm_ins_bytes[] = "\x0f\xaa";

static unsigned long iopm_base;

struct kvm_ldttss_desc {
	u16 limit0;
	u16 base0;
	unsigned base1:8, type:5, dpl:2, p:1;
	unsigned limit1:4, zero0:3, g:1, base2:8;
	u32 base3;
	u32 zero1;
} __attribute__((packed));

DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);

/*
 * Only MSR_TSC_AUX is switched via the user return hook.  EFER is switched via
 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 *
 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 * defer the restoration of TSC_AUX until the CPU returns to userspace.
 */
static int tsc_aux_uret_slot __read_mostly = -1;

static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};

#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)

u32 svm_msrpm_offset(u32 msr)
{
	u32 offset;
	int i;

	for (i = 0; i < NUM_MSR_MAPS; i++) {
		if (msr < msrpm_ranges[i] ||
		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
			continue;

		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8   */
		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */

		/* Now we have the u8 offset - but need the u32 offset */
		return offset / 4;
	}

	/* MSR not in any range */
	return MSR_INVALID;
}

static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);

static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
	return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
#else
	return PT32E_ROOT_LEVEL;
#endif
}

int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 old_efer = vcpu->arch.efer;
	vcpu->arch.efer = efer;

	if (!npt_enabled) {
		/* Shadow paging assumes NX to be available.  */
		efer |= EFER_NX;

		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}

	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
		if (!(efer & EFER_SVME)) {
			svm_leave_nested(vcpu);
			svm_set_gif(svm, true);
			/* #GP intercept is still needed for vmware backdoor */
			if (!enable_vmware_backdoor)
				clr_exception_intercept(svm, GP_VECTOR);

			/*
			 * Free the nested guest state, unless we are in SMM.
			 * In this case we will return to the nested guest
			 * as soon as we leave SMM.
			 */
			if (!is_smm(vcpu))
				svm_free_nested(svm);

		} else {
			int ret = svm_allocate_nested(svm);

			if (ret) {
				vcpu->arch.efer = old_efer;
				return ret;
			}

			/*
			 * Never intercept #GP for SEV guests, KVM can't
			 * decrypt guest memory to work around the erratum.
			 */
			if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
				set_exception_intercept(svm, GP_VECTOR);
		}
	}

	svm->vmcb->save.efer = efer | EFER_SVME;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
	return 0;
}

static int is_external_interrupt(u32 info)
{
	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
}

static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ret = 0;

	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
	return ret;
}

static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (mask == 0)
		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
	else
		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;

}

static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * SEV-ES does not expose the next RIP. The RIP update is controlled by
	 * the type of exit and the #VC handler in the guest.
	 */
	if (sev_es_guest(vcpu->kvm))
		goto done;

	if (nrips && svm->vmcb->control.next_rip != 0) {
		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
		svm->next_rip = svm->vmcb->control.next_rip;
	}

	if (!svm->next_rip) {
		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;
	} else {
		kvm_rip_write(vcpu, svm->next_rip);
	}

done:
	svm_set_interrupt_shadow(vcpu, 0);

	return 1;
}

static void svm_queue_exception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned nr = vcpu->arch.exception.nr;
	bool has_error_code = vcpu->arch.exception.has_error_code;
	u32 error_code = vcpu->arch.exception.error_code;

	kvm_deliver_exception_payload(vcpu);

	if (nr == BP_VECTOR && !nrips) {
		unsigned long rip, old_rip = kvm_rip_read(vcpu);

		/*
		 * For guest debugging where we have to reinject #BP if some
		 * INT3 is guest-owned:
		 * Emulate nRIP by moving RIP forward. Will fail if injection
		 * raises a fault that is not intercepted. Still better than
		 * failing in all cases.
		 */
		(void)svm_skip_emulated_instruction(vcpu);
		rip = kvm_rip_read(vcpu);
		svm->int3_rip = rip + svm->vmcb->save.cs.base;
		svm->int3_injected = rip - old_rip;
	}

	svm->vmcb->control.event_inj = nr
		| SVM_EVTINJ_VALID
		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
		| SVM_EVTINJ_TYPE_EXEPT;
	svm->vmcb->control.event_inj_err = error_code;
}

static void svm_init_erratum_383(void)
{
	u32 low, high;
	int err;
	u64 val;

	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
		return;

	/* Use _safe variants to not break nested virtualization */
	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
	if (err)
		return;

	val |= (1ULL << 47);

	low  = lower_32_bits(val);
	high = upper_32_bits(val);

	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);

	erratum_383_found = true;
}

static void svm_init_osvw(struct kvm_vcpu *vcpu)
{
	/*
	 * Guests should see errata 400 and 415 as fixed (assuming that
	 * HLT and IO instructions are intercepted).
	 */
	vcpu->arch.osvw.length = (osvw_len >= 3) ?
				 (osvw_len) : 3;
	vcpu->arch.osvw.status = osvw_status & ~(6ULL);

	/*
	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
	 * all osvw.status bits inside that length, including bit 0 (which is
	 * reserved for erratum 298), are valid. However, if host processor's
	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
	 * be conservative here and therefore we tell the guest that erratum 298
	 * is present (because we really don't know).
	 */
	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
		vcpu->arch.osvw.status |= 1;
}

static int has_svm(void)
{
	const char *msg;

	if (!cpu_has_svm(&msg)) {
		printk(KERN_INFO "has_svm: %s\n", msg);
		return 0;
	}

	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
		pr_info("KVM is unsupported when running as an SEV guest\n");
		return 0;
	}

	return 1;
}

static void svm_hardware_disable(void)
{
	/* Make sure we clean up behind us */
	if (tsc_scaling)
		wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT);

	cpu_svm_disable();

	amd_pmu_disable_virt();
}

static int svm_hardware_enable(void)
{

	struct svm_cpu_data *sd;
	uint64_t efer;
	struct desc_struct *gdt;
	int me = raw_smp_processor_id();

	rdmsrl(MSR_EFER, efer);
	if (efer & EFER_SVME)
		return -EBUSY;

	if (!has_svm()) {
		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
		return -EINVAL;
	}
	sd = per_cpu(svm_data, me);
	if (!sd) {
		pr_err("%s: svm_data is NULL on %d\n", __func__, me);
		return -EINVAL;
	}

	sd->asid_generation = 1;
	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
	sd->next_asid = sd->max_asid + 1;
	sd->min_asid = max_sev_asid + 1;

	gdt = get_current_gdt_rw();
	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);

	wrmsrl(MSR_EFER, efer | EFER_SVME);

	wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));

	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		/*
		 * Set the default value, even if we don't use TSC scaling
		 * to avoid having stale value in the msr
		 */
		wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT);
		__this_cpu_write(current_tsc_ratio, SVM_TSC_RATIO_DEFAULT);
	}

	/*
	 * Get OSVW bits.
	 *
	 * Note that it is possible to have a system with mixed processor
	 * revisions and therefore different OSVW bits. If bits are not the same
	 * on different processors then choose the worst case (i.e. if erratum
	 * is present on one processor and not on another then assume that the
	 * erratum is present everywhere).
	 */
	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
		uint64_t len, status = 0;
		int err;

		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
		if (!err)
			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
						      &err);

		if (err)
			osvw_status = osvw_len = 0;
		else {
			if (len < osvw_len)
				osvw_len = len;
			osvw_status |= status;
			osvw_status &= (1ULL << osvw_len) - 1;
		}
	} else
		osvw_status = osvw_len = 0;

	svm_init_erratum_383();

	amd_pmu_enable_virt();

	return 0;
}

static void svm_cpu_uninit(int cpu)
{
	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);

	if (!sd)
		return;

	per_cpu(svm_data, cpu) = NULL;
	kfree(sd->sev_vmcbs);
	__free_page(sd->save_area);
	kfree(sd);
}

static int svm_cpu_init(int cpu)
{
	struct svm_cpu_data *sd;
	int ret = -ENOMEM;

	sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
	if (!sd)
		return ret;
	sd->cpu = cpu;
	sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!sd->save_area)
		goto free_cpu_data;

	ret = sev_cpu_init(sd);
	if (ret)
		goto free_save_area;

	per_cpu(svm_data, cpu) = sd;

	return 0;

free_save_area:
	__free_page(sd->save_area);
free_cpu_data:
	kfree(sd);
	return ret;

}

static int direct_access_msr_slot(u32 msr)
{
	u32 i;

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
		if (direct_access_msrs[i].index == msr)
			return i;

	return -ENOENT;
}

static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
				     int write)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int slot = direct_access_msr_slot(msr);

	if (slot == -ENOENT)
		return;

	/* Set the shadow bitmaps to the desired intercept states */
	if (read)
		set_bit(slot, svm->shadow_msr_intercept.read);
	else
		clear_bit(slot, svm->shadow_msr_intercept.read);

	if (write)
		set_bit(slot, svm->shadow_msr_intercept.write);
	else
		clear_bit(slot, svm->shadow_msr_intercept.write);
}

static bool valid_msr_intercept(u32 index)
{
	return direct_access_msr_slot(index) != -ENOENT;
}

static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
	u8 bit_write;
	unsigned long tmp;
	u32 offset;
	u32 *msrpm;

	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
				      to_svm(vcpu)->msrpm;

	offset    = svm_msrpm_offset(msr);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	return !!test_bit(bit_write, &tmp);
}

static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
					u32 msr, int read, int write)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u8 bit_read, bit_write;
	unsigned long tmp;
	u32 offset;

	/*
	 * If this warning triggers extend the direct_access_msrs list at the
	 * beginning of the file
	 */
	WARN_ON(!valid_msr_intercept(msr));

	/* Enforce non allowed MSRs to trap */
	if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
		read = 0;

	if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
		write = 0;

	offset    = svm_msrpm_offset(msr);
	bit_read  = 2 * (msr & 0x0f);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	read ?
		clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);

	msrpm[offset] = tmp;

	svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
	svm->nested.force_msr_bitmap_recalc = true;
}

void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
			  int read, int write)
{
	set_shadow_msr_intercept(vcpu, msr, read, write);
	set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
}

u32 *svm_vcpu_alloc_msrpm(void)
{
	unsigned int order = get_order(MSRPM_SIZE);
	struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
	u32 *msrpm;

	if (!pages)
		return NULL;

	msrpm = page_address(pages);
	memset(msrpm, 0xff, PAGE_SIZE * (1 << order));

	return msrpm;
}

void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
{
	int i;

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		if (!direct_access_msrs[i].always)
			continue;
		set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
	}
}

void svm_vcpu_free_msrpm(u32 *msrpm)
{
	__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}

static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 i;

	/*
	 * Set intercept permissions for all direct access MSRs again. They
	 * will automatically get filtered through the MSR filter, so we are
	 * back in sync after this.
	 */
	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		u32 msr = direct_access_msrs[i].index;
		u32 read = test_bit(i, svm->shadow_msr_intercept.read);
		u32 write = test_bit(i, svm->shadow_msr_intercept.write);

		set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
	}
}

static void add_msr_offset(u32 offset)
{
	int i;

	for (i = 0; i < MSRPM_OFFSETS; ++i) {

		/* Offset already in list? */
		if (msrpm_offsets[i] == offset)
			return;

		/* Slot used by another offset? */
		if (msrpm_offsets[i] != MSR_INVALID)
			continue;

		/* Add offset to list */
		msrpm_offsets[i] = offset;

		return;
	}

	/*
	 * If this BUG triggers the msrpm_offsets table has an overflow. Just
	 * increase MSRPM_OFFSETS in this case.
	 */
	BUG();
}

static void init_msrpm_offsets(void)
{
	int i;

	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		u32 offset;

		offset = svm_msrpm_offset(direct_access_msrs[i].index);
		BUG_ON(offset == MSR_INVALID);

		add_msr_offset(offset);
	}
}

static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}

static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}

void disable_nmi_singlestep(struct vcpu_svm *svm)
{
	svm->nmi_singlestep = false;

	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
		/* Clear our flags if they were not set by the guest */
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
	}
}

static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	control->pause_filter_count = __grow_ple_window(old,
							pause_filter_count,
							pause_filter_count_grow,
							pause_filter_count_max);

	if (control->pause_filter_count != old) {
		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    control->pause_filter_count, old);
	}
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	control->pause_filter_count =
			__shrink_ple_window(old,
					    pause_filter_count,
					    pause_filter_count_shrink,
					    pause_filter_count);
	if (control->pause_filter_count != old) {
		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    control->pause_filter_count, old);
	}
}

static void svm_hardware_unsetup(void)
{
	int cpu;

	sev_hardware_unsetup();

	for_each_possible_cpu(cpu)
		svm_cpu_uninit(cpu);

	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
		     get_order(IOPM_SIZE));
	iopm_base = 0;
}

static void init_seg(struct vmcb_seg *seg)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
	seg->limit = 0xffff;
	seg->base = 0;
}

static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | type;
	seg->limit =
		0xffff;
	seg->base = 0;
}

static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return svm->nested.ctl.tsc_offset;
}

static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return svm->tsc_ratio_msr;
}

static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
	svm->vmcb->control.tsc_offset = offset;
	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}

void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
{
	wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
}

/* Evaluate instruction intercepts that depend on guest CPUID features. */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
					      struct vcpu_svm *svm)
{
	/*
	 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
	 * roots, or if INVPCID is disabled in the guest to inject #UD.
	 */
	if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
		if (!npt_enabled ||
		    !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
			svm_set_intercept(svm, INTERCEPT_INVPCID);
		else
			svm_clr_intercept(svm, INTERCEPT_INVPCID);
	}

	if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
		if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
			svm_clr_intercept(svm, INTERCEPT_RDTSCP);
		else
			svm_set_intercept(svm, INTERCEPT_RDTSCP);
	}
}

static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (guest_cpuid_is_intel(vcpu)) {
		/*
		 * We must intercept SYSENTER_EIP and SYSENTER_ESP
		 * accesses because the processor only stores 32 bits.
		 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
		 */
		svm_set_intercept(svm, INTERCEPT_VMLOAD);
		svm_set_intercept(svm, INTERCEPT_VMSAVE);
		svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;

		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
	} else {
		/*
		 * If hardware supports Virtual VMLOAD VMSAVE then enable it
		 * in VMCB and clear intercepts to avoid #VMEXIT.
		 */
		if (vls) {
			svm_clr_intercept(svm, INTERCEPT_VMLOAD);
			svm_clr_intercept(svm, INTERCEPT_VMSAVE);
			svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
		}
		/* No need to intercept these MSRs */
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
	}
}

static void init_vmcb(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	struct vmcb_save_area *save = &svm->vmcb->save;

	svm_set_intercept(svm, INTERCEPT_CR0_READ);
	svm_set_intercept(svm, INTERCEPT_CR3_READ);
	svm_set_intercept(svm, INTERCEPT_CR4_READ);
	svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
	svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
	svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
	if (!kvm_vcpu_apicv_active(vcpu))
		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);

	set_dr_intercepts(svm);

	set_exception_intercept(svm, PF_VECTOR);
	set_exception_intercept(svm, UD_VECTOR);
	set_exception_intercept(svm, MC_VECTOR);
	set_exception_intercept(svm, AC_VECTOR);
	set_exception_intercept(svm, DB_VECTOR);
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.  Don't intercept #GP for SEV guests as KVM can't
	 * decrypt guest memory to decode the faulting instruction.
	 */
	if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
		set_exception_intercept(svm, GP_VECTOR);

	svm_set_intercept(svm, INTERCEPT_INTR);
	svm_set_intercept(svm, INTERCEPT_NMI);

	if (intercept_smi)
		svm_set_intercept(svm, INTERCEPT_SMI);

	svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
	svm_set_intercept(svm, INTERCEPT_RDPMC);
	svm_set_intercept(svm, INTERCEPT_CPUID);
	svm_set_intercept(svm, INTERCEPT_INVD);
	svm_set_intercept(svm, INTERCEPT_INVLPG);
	svm_set_intercept(svm, INTERCEPT_INVLPGA);
	svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
	svm_set_intercept(svm, INTERCEPT_MSR_PROT);
	svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
	svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
	svm_set_intercept(svm, INTERCEPT_VMRUN);
	svm_set_intercept(svm, INTERCEPT_VMMCALL);
	svm_set_intercept(svm, INTERCEPT_VMLOAD);
	svm_set_intercept(svm, INTERCEPT_VMSAVE);
	svm_set_intercept(svm, INTERCEPT_STGI);
	svm_set_intercept(svm, INTERCEPT_CLGI);
	svm_set_intercept(svm, INTERCEPT_SKINIT);
	svm_set_intercept(svm, INTERCEPT_WBINVD);
	svm_set_intercept(svm, INTERCEPT_XSETBV);
	svm_set_intercept(svm, INTERCEPT_RDPRU);
	svm_set_intercept(svm, INTERCEPT_RSM);

	if (!kvm_mwait_in_guest(vcpu->kvm)) {
		svm_set_intercept(svm, INTERCEPT_MONITOR);
		svm_set_intercept(svm, INTERCEPT_MWAIT);
	}

	if (!kvm_hlt_in_guest(vcpu->kvm))
		svm_set_intercept(svm, INTERCEPT_HLT);

	control->iopm_base_pa = __sme_set(iopm_base);
	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
	control->int_ctl = V_INTR_MASKING_MASK;

	init_seg(&save->es);
	init_seg(&save->ss);
	init_seg(&save->ds);
	init_seg(&save->fs);
	init_seg(&save->gs);

	save->cs.selector = 0xf000;
	save->cs.base = 0xffff0000;
	/* Executable/Readable Code Segment */
	save->cs.attrib = SVM_SELECTOR_READ_MASK |
			  SVM_SELECTOR_P_MASK |
			  SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
	save->cs.limit = 0xffff;

	save->gdtr.base = 0;
	save->gdtr.limit = 0xffff;
	save->idtr.base = 0;
	save->idtr.limit = 0xffff;

	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);

	if (npt_enabled) {
		/* Setup VMCB for Nested Paging */
		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
		svm_clr_intercept(svm, INTERCEPT_INVLPG);
		clr_exception_intercept(svm, PF_VECTOR);
		svm_clr_intercept(svm, INTERCEPT_CR3_READ);
		svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
		save->g_pat = vcpu->arch.pat;
		save->cr3 = 0;
	}
	svm->current_vmcb->asid_generation = 0;
	svm->asid = 0;

	svm->nested.vmcb12_gpa = INVALID_GPA;
	svm->nested.last_vmcb12_gpa = INVALID_GPA;

	if (!kvm_pause_in_guest(vcpu->kvm)) {
		control->pause_filter_count = pause_filter_count;
		if (pause_filter_thresh)
			control->pause_filter_thresh = pause_filter_thresh;
		svm_set_intercept(svm, INTERCEPT_PAUSE);
	} else {
		svm_clr_intercept(svm, INTERCEPT_PAUSE);
	}

	svm_recalc_instruction_intercepts(vcpu, svm);

	/*
	 * If the host supports V_SPEC_CTRL then disable the interception
	 * of MSR_IA32_SPEC_CTRL.
	 */
	if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);

	if (kvm_vcpu_apicv_active(vcpu))
		avic_init_vmcb(svm);

	if (vgif) {
		svm_clr_intercept(svm, INTERCEPT_STGI);
		svm_clr_intercept(svm, INTERCEPT_CLGI);
		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
	}

	if (sev_guest(vcpu->kvm)) {
		svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
		clr_exception_intercept(svm, UD_VECTOR);

		if (sev_es_guest(vcpu->kvm)) {
			/* Perform SEV-ES specific VMCB updates */
			sev_es_init_vmcb(svm);
		}
	}

	svm_hv_init_vmcb(svm->vmcb);
	init_vmcb_after_set_cpuid(vcpu);

	vmcb_mark_all_dirty(svm->vmcb);

	enable_gif(svm);
}

static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm_vcpu_init_msrpm(vcpu, svm->msrpm);

	svm_init_osvw(vcpu);
	vcpu->arch.microcode_version = 0x01000065;
	svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio;

	if (sev_es_guest(vcpu->kvm))
		sev_es_vcpu_reset(svm);
}

static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->spec_ctrl = 0;
	svm->virt_spec_ctrl = 0;

	init_vmcb(vcpu);

	if (!init_event)
		__svm_vcpu_reset(vcpu);
}

void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
{
	svm->current_vmcb = target_vmcb;
	svm->vmcb = target_vmcb->ptr;
}

static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm;
	struct page *vmcb01_page;
	struct page *vmsa_page = NULL;
	int err;

	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
	svm = to_svm(vcpu);

	err = -ENOMEM;
	vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!vmcb01_page)
		goto out;

	if (sev_es_guest(vcpu->kvm)) {
		/*
		 * SEV-ES guests require a separate VMSA page used to contain
		 * the encrypted register state of the guest.
		 */
		vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!vmsa_page)
			goto error_free_vmcb_page;

		/*
		 * SEV-ES guests maintain an encrypted version of their FPU
		 * state which is restored and saved on VMRUN and VMEXIT.
		 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
		 * do xsave/xrstor on it.
		 */
		fpstate_set_confidential(&vcpu->arch.guest_fpu);
	}

	err = avic_init_vcpu(svm);
	if (err)
		goto error_free_vmsa_page;

	svm->msrpm = svm_vcpu_alloc_msrpm();
	if (!svm->msrpm) {
		err = -ENOMEM;
		goto error_free_vmsa_page;
	}

	svm->vmcb01.ptr = page_address(vmcb01_page);
	svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
	svm_switch_vmcb(svm, &svm->vmcb01);

	if (vmsa_page)
		svm->sev_es.vmsa = page_address(vmsa_page);

	svm->guest_state_loaded = false;

	return 0;

error_free_vmsa_page:
	if (vmsa_page)
		__free_page(vmsa_page);
error_free_vmcb_page:
	__free_page(vmcb01_page);
out:
	return err;
}

static void svm_clear_current_vmcb(struct vmcb *vmcb)
{
	int i;

	for_each_online_cpu(i)
		cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
}

static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * The vmcb page can be recycled, causing a false negative in
	 * svm_vcpu_load(). So, ensure that no logical CPU has this
	 * vmcb page recorded as its current vmcb.
	 */
	svm_clear_current_vmcb(svm->vmcb);

	svm_free_nested(svm);

	sev_free_vcpu(vcpu);

	__free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
	__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}

static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);

	if (sev_es_guest(vcpu->kvm))
		sev_es_unmap_ghcb(svm);

	if (svm->guest_state_loaded)
		return;

	/*
	 * Save additional host state that will be restored on VMEXIT (sev-es)
	 * or subsequent vmload of host save area.
	 */
	vmsave(__sme_page_pa(sd->save_area));
	if (sev_es_guest(vcpu->kvm)) {
		struct vmcb_save_area *hostsa;
		hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);

		sev_es_prepare_switch_to_guest(hostsa);
	}

	if (tsc_scaling) {
		u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
		if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
			__this_cpu_write(current_tsc_ratio, tsc_ratio);
			wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
		}
	}

	if (likely(tsc_aux_uret_slot >= 0))
		kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);

	svm->guest_state_loaded = true;
}

static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
{
	to_svm(vcpu)->guest_state_loaded = false;
}

static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);

	if (sd->current_vmcb != svm->vmcb) {
		sd->current_vmcb = svm->vmcb;
		indirect_branch_prediction_barrier();
	}
	if (kvm_vcpu_apicv_active(vcpu))
		__avic_vcpu_load(vcpu, cpu);
}

static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
	if (kvm_vcpu_apicv_active(vcpu))
		__avic_vcpu_put(vcpu);

	svm_prepare_host_switch(vcpu);

	++vcpu->stat.host_state_reload;
}

static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long rflags = svm->vmcb->save.rflags;

	if (svm->nmi_singlestep) {
		/* Hide our flags if they were not set by the guest */
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
			rflags &= ~X86_EFLAGS_TF;
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
			rflags &= ~X86_EFLAGS_RF;
	}
	return rflags;
}

static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	if (to_svm(vcpu)->nmi_singlestep)
		rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);

	/*
	 * Any change of EFLAGS.VM is accompanied by a reload of SS
	 * (caused by either a task switch or an inter-privilege IRET),
	 * so we do not need to update the CPL here.
	 */
	to_svm(vcpu)->vmcb->save.rflags = rflags;
}

static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
{
	struct vmcb *vmcb = to_svm(vcpu)->vmcb;

	return sev_es_guest(vcpu->kvm)
		? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
		: kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
}

static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	kvm_register_mark_available(vcpu, reg);

	switch (reg) {
	case VCPU_EXREG_PDPTR:
		/*
		 * When !npt_enabled, mmu->pdptrs[] is already available since
		 * it is always updated per SDM when moving to CRs.
		 */
		if (npt_enabled)
			load_pdptrs(vcpu, kvm_read_cr3(vcpu));
		break;
	default:
		KVM_BUG_ON(1, vcpu->kvm);
	}
}

static void svm_set_vintr(struct vcpu_svm *svm)
{
	struct vmcb_control_area *control;

	/*
	 * The following fields are ignored when AVIC is enabled
	 */
	WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));

	svm_set_intercept(svm, INTERCEPT_VINTR);

	/*
	 * This is just a dummy VINTR to actually cause a vmexit to happen.
	 * Actual injection of virtual interrupts happens through EVENTINJ.
	 */
	control = &svm->vmcb->control;
	control->int_vector = 0x0;
	control->int_ctl &= ~V_INTR_PRIO_MASK;
	control->int_ctl |= V_IRQ_MASK |
		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}

static void svm_clear_vintr(struct vcpu_svm *svm)
{
	svm_clr_intercept(svm, INTERCEPT_VINTR);

	/* Drop int_ctl fields related to VINTR injection.  */
	svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
	if (is_guest_mode(&svm->vcpu)) {
		svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;

		WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
			(svm->nested.ctl.int_ctl & V_TPR_MASK));

		svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
			V_IRQ_INJECTION_BITS_MASK;

		svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
	}

	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}

static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
	struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;

	switch (seg) {
	case VCPU_SREG_CS: return &save->cs;
	case VCPU_SREG_DS: return &save->ds;
	case VCPU_SREG_ES: return &save->es;
	case VCPU_SREG_FS: return &save01->fs;
	case VCPU_SREG_GS: return &save01->gs;
	case VCPU_SREG_SS: return &save->ss;
	case VCPU_SREG_TR: return &save01->tr;
	case VCPU_SREG_LDTR: return &save01->ldtr;
	}
	BUG();
	return NULL;
}

static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	return s->base;
}

static void svm_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	var->base = s->base;
	var->limit = s->limit;
	var->selector = s->selector;
	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;

	/*
	 * AMD CPUs circa 2014 track the G bit for all segments except CS.
	 * However, the SVM spec states that the G bit is not observed by the
	 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
	 * So let's synthesize a legal G bit for all segments, this helps
	 * running KVM nested. It also helps cross-vendor migration, because
	 * Intel's vmentry has a check on the 'G' bit.
	 */
	var->g = s->limit > 0xfffff;

	/*
	 * AMD's VMCB does not have an explicit unusable field, so emulate it
	 * for cross vendor migration purposes by "not present"
	 */
	var->unusable = !var->present;

	switch (seg) {
	case VCPU_SREG_TR:
		/*
		 * Work around a bug where the busy flag in the tr selector
		 * isn't exposed
		 */
		var->type |= 0x2;
		break;
	case VCPU_SREG_DS:
	case VCPU_SREG_ES:
	case VCPU_SREG_FS:
	case VCPU_SREG_GS:
		/*
		 * The accessed bit must always be set in the segment
		 * descriptor cache, although it can be cleared in the
		 * descriptor, the cached bit always remains at 1.
		 * Since Intel has a check on this, set it here to support
		 * cross-vendor migration.
		 */
		if (!var->unusable)
			var->type |= 0x1;
		break;
	case VCPU_SREG_SS:
		/*
		 * On AMD CPUs sometimes the DB bit in the segment
		 * descriptor is left as 1, although the whole segment has
		 * been made unusable. Clear it here to pass an Intel VMX
		 * entry check when cross vendor migrating.
		 */
		if (var->unusable)
			var->db = 0;
		/* This is symmetric with svm_set_segment() */
		var->dpl = to_svm(vcpu)->vmcb->save.cpl;
		break;
	}
}

static int svm_get_cpl(struct kvm_vcpu *vcpu)
{
	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;

	return save->cpl;
}

static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
	struct kvm_segment cs;

	svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
	*db = cs.db;
	*l = cs.l;
}

static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	dt->size = svm->vmcb->save.idtr.limit;
	dt->address = svm->vmcb->save.idtr.base;
}

static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->save.idtr.limit = dt->size;
	svm->vmcb->save.idtr.base = dt->address;
	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}

static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	dt->size = svm->vmcb->save.gdtr.limit;
	dt->address = svm->vmcb->save.gdtr.base;
}

static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->save.gdtr.limit = dt->size;
	svm->vmcb->save.gdtr.base = dt->address;
	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}

static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * For guests that don't set guest_state_protected, the cr3 update is
	 * handled via kvm_mmu_load() while entering the guest. For guests
	 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
	 * VMCB save area now, since the save area will become the initial
	 * contents of the VMSA, and future VMCB save area updates won't be
	 * seen.
	 */
	if (sev_es_guest(vcpu->kvm)) {
		svm->vmcb->save.cr3 = cr3;
		vmcb_mark_dirty(svm->vmcb, VMCB_CR);
	}
}

void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 hcr0 = cr0;
	bool old_paging = is_paging(vcpu);

#ifdef CONFIG_X86_64
	if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
			vcpu->arch.efer |= EFER_LMA;
			svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
		}

		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
			vcpu->arch.efer &= ~EFER_LMA;
			svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
		}
	}
#endif
	vcpu->arch.cr0 = cr0;

	if (!npt_enabled) {
		hcr0 |= X86_CR0_PG | X86_CR0_WP;
		if (old_paging != is_paging(vcpu))
			svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
	}

	/*
	 * re-enable caching here because the QEMU bios
	 * does not do it - this results in some delay at
	 * reboot
	 */
	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
		hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);

	svm->vmcb->save.cr0 = hcr0;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);

	/*
	 * SEV-ES guests must always keep the CR intercepts cleared. CR
	 * tracking is done using the CR write traps.
	 */
	if (sev_es_guest(vcpu->kvm))
		return;

	if (hcr0 == cr0) {
		/* Selective CR0 write remains on. */
		svm_clr_intercept(svm, INTERCEPT_CR0_READ);
		svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
	} else {
		svm_set_intercept(svm, INTERCEPT_CR0_READ);
		svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
	}
}

static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	return true;
}

void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
	unsigned long old_cr4 = vcpu->arch.cr4;

	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
		svm_flush_tlb_current(vcpu);

	vcpu->arch.cr4 = cr4;
	if (!npt_enabled) {
		cr4 |= X86_CR4_PAE;

		if (!is_paging(vcpu))
			cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
	}
	cr4 |= host_cr4_mce;
	to_svm(vcpu)->vmcb->save.cr4 = cr4;
	vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);

	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
		kvm_update_cpuid_runtime(vcpu);
}

static void svm_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	s->base = var->base;
	s->limit = var->limit;
	s->selector = var->selector;
	s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
	s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
	s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
	s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
	s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
	s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
	s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
	s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;

	/*
	 * This is always accurate, except if SYSRET returned to a segment
	 * with SS.DPL != 3.
	 * Intel does not have this quirk, and always
	 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
	 * would entail passing the CPL to userspace and back.
	 */
	if (seg == VCPU_SREG_SS)
		/* This is symmetric with svm_get_segment() */
		svm->vmcb->save.cpl = (var->dpl & 3);

	vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
}

static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	clr_exception_intercept(svm, BP_VECTOR);

	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			set_exception_intercept(svm, BP_VECTOR);
	}
}

static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
	if (sd->next_asid > sd->max_asid) {
		++sd->asid_generation;
		sd->next_asid = sd->min_asid;
		svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
	}

	svm->current_vmcb->asid_generation = sd->asid_generation;
	svm->asid = sd->next_asid++;
}

static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value)
{
	struct vmcb *vmcb = svm->vmcb;

	if (svm->vcpu.arch.guest_state_protected)
		return;

	if (unlikely(value != vmcb->save.dr6)) {
		vmcb->save.dr6 = value;
		vmcb_mark_dirty(vmcb, VMCB_DR);
	}
}

static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (vcpu->arch.guest_state_protected)
		return;

	get_debugreg(vcpu->arch.db[0], 0);
	get_debugreg(vcpu->arch.db[1], 1);
	get_debugreg(vcpu->arch.db[2], 2);
	get_debugreg(vcpu->arch.db[3], 3);
	/*
	 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here,
	 * because db_interception might need it.  We can do it before vmentry.
	 */
	vcpu->arch.dr6 = svm->vmcb->save.dr6;
	vcpu->arch.dr7 = svm->vmcb->save.dr7;
	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
	set_dr_intercepts(svm);
}

static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (vcpu->arch.guest_state_protected)
		return;

	svm->vmcb->save.dr7 = value;
	vmcb_mark_dirty(svm->vmcb, VMCB_DR);
}

static int pf_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	u64 fault_address = svm->vmcb->control.exit_info_2;
	u64 error_code = svm->vmcb->control.exit_info_1;

	return kvm_handle_page_fault(vcpu, error_code, fault_address,
			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
			svm->vmcb->control.insn_bytes : NULL,
			svm->vmcb->control.insn_len);
}

static int npf_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	u64 fault_address = svm->vmcb->control.exit_info_2;
	u64 error_code = svm->vmcb->control.exit_info_1;

	trace_kvm_page_fault(fault_address, error_code);
	return kvm_mmu_page_fault(vcpu, fault_address, error_code,
			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
			svm->vmcb->control.insn_bytes : NULL,
			svm->vmcb->control.insn_len);
}

static int db_interception(struct kvm_vcpu *vcpu)
{
	struct kvm_run *kvm_run = vcpu->run;
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!(vcpu->guest_debug &
	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
	    !svm->nmi_singlestep) {
		u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW;
		kvm_queue_exception_p(vcpu, DB_VECTOR, payload);
		return 1;
	}

	if (svm->nmi_singlestep) {
		disable_nmi_singlestep(svm);
		/* Make sure we check for pending NMIs upon entry */
		kvm_make_request(KVM_REQ_EVENT, vcpu);
	}

	if (vcpu->guest_debug &
	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
		kvm_run->exit_reason = KVM_EXIT_DEBUG;
		kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6;
		kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7;
		kvm_run->debug.arch.pc =
			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
		kvm_run->debug.arch.exception = DB_VECTOR;
		return 0;
	}

	return 1;
}

static int bp_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_run *kvm_run = vcpu->run;

	kvm_run->exit_reason = KVM_EXIT_DEBUG;
	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
	kvm_run->debug.arch.exception = BP_VECTOR;
	return 0;
}

static int ud_interception(struct kvm_vcpu *vcpu)
{
	return handle_ud(vcpu);
}

static int ac_interception(struct kvm_vcpu *vcpu)
{
	kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
	return 1;
}

static bool is_erratum_383(void)
{
	int err, i;
	u64 value;

	if (!erratum_383_found)
		return false;

	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
	if (err)
		return false;

	/* Bit 62 may or may not be set for this mce */
	value &= ~(1ULL << 62);

	if (value != 0xb600000000010015ULL)
		return false;

	/* Clear MCi_STATUS registers */
	for (i = 0; i < 6; ++i)
		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);

	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
	if (!err) {
		u32 low, high;

		value &= ~(1ULL << 2);
		low    = lower_32_bits(value);
		high   = upper_32_bits(value);

		native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
	}

	/* Flush tlb to evict multi-match entries */
	__flush_tlb_all();

	return true;
}

static void svm_handle_mce(struct kvm_vcpu *vcpu)
{
	if (is_erratum_383()) {
		/*
		 * Erratum 383 triggered. Guest state is corrupt so kill the
		 * guest.
		 */
		pr_err("KVM: Guest triggered AMD Erratum 383\n");

		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);

		return;
	}

	/*
	 * On an #MC intercept the MCE handler is not called automatically in
	 * the host. So do it by hand here.
	 */
	kvm_machine_check();
}

static int mc_interception(struct kvm_vcpu *vcpu)
{
	return 1;
}

static int shutdown_interception(struct kvm_vcpu *vcpu)
{
	struct kvm_run *kvm_run = vcpu->run;
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * The VM save area has already been encrypted so it
	 * cannot be reinitialized - just terminate.
1922 */ 1923 if (sev_es_guest(vcpu->kvm)) 1924 return -EINVAL; 1925 1926 /* 1927 * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put 1928 * the VMCB in a known good state. Unfortuately, KVM doesn't have 1929 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking 1930 * userspace. At a platform view, INIT is acceptable behavior as 1931 * there exist bare metal platforms that automatically INIT the CPU 1932 * in response to shutdown. 1933 */ 1934 clear_page(svm->vmcb); 1935 kvm_vcpu_reset(vcpu, true); 1936 1937 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 1938 return 0; 1939 } 1940 1941 static int io_interception(struct kvm_vcpu *vcpu) 1942 { 1943 struct vcpu_svm *svm = to_svm(vcpu); 1944 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1945 int size, in, string; 1946 unsigned port; 1947 1948 ++vcpu->stat.io_exits; 1949 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1950 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1951 port = io_info >> 16; 1952 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1953 1954 if (string) { 1955 if (sev_es_guest(vcpu->kvm)) 1956 return sev_es_string_io(svm, size, port, in); 1957 else 1958 return kvm_emulate_instruction(vcpu, 0); 1959 } 1960 1961 svm->next_rip = svm->vmcb->control.exit_info_2; 1962 1963 return kvm_fast_pio(vcpu, size, port, in); 1964 } 1965 1966 static int nmi_interception(struct kvm_vcpu *vcpu) 1967 { 1968 return 1; 1969 } 1970 1971 static int smi_interception(struct kvm_vcpu *vcpu) 1972 { 1973 return 1; 1974 } 1975 1976 static int intr_interception(struct kvm_vcpu *vcpu) 1977 { 1978 ++vcpu->stat.irq_exits; 1979 return 1; 1980 } 1981 1982 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) 1983 { 1984 struct vcpu_svm *svm = to_svm(vcpu); 1985 struct vmcb *vmcb12; 1986 struct kvm_host_map map; 1987 int ret; 1988 1989 if (nested_svm_check_permissions(vcpu)) 1990 return 1; 1991 1992 ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); 1993 if (ret) { 1994 if (ret == -EINVAL) 1995 kvm_inject_gp(vcpu, 0); 1996 return 1; 1997 } 1998 1999 vmcb12 = map.hva; 2000 2001 ret = kvm_skip_emulated_instruction(vcpu); 2002 2003 if (vmload) { 2004 svm_copy_vmloadsave_state(svm->vmcb, vmcb12); 2005 svm->sysenter_eip_hi = 0; 2006 svm->sysenter_esp_hi = 0; 2007 } else { 2008 svm_copy_vmloadsave_state(vmcb12, svm->vmcb); 2009 } 2010 2011 kvm_vcpu_unmap(vcpu, &map, true); 2012 2013 return ret; 2014 } 2015 2016 static int vmload_interception(struct kvm_vcpu *vcpu) 2017 { 2018 return vmload_vmsave_interception(vcpu, true); 2019 } 2020 2021 static int vmsave_interception(struct kvm_vcpu *vcpu) 2022 { 2023 return vmload_vmsave_interception(vcpu, false); 2024 } 2025 2026 static int vmrun_interception(struct kvm_vcpu *vcpu) 2027 { 2028 if (nested_svm_check_permissions(vcpu)) 2029 return 1; 2030 2031 return nested_svm_vmrun(vcpu); 2032 } 2033 2034 enum { 2035 NONE_SVM_INSTR, 2036 SVM_INSTR_VMRUN, 2037 SVM_INSTR_VMLOAD, 2038 SVM_INSTR_VMSAVE, 2039 }; 2040 2041 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */ 2042 static int svm_instr_opcode(struct kvm_vcpu *vcpu) 2043 { 2044 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 2045 2046 if (ctxt->b != 0x1 || ctxt->opcode_len != 2) 2047 return NONE_SVM_INSTR; 2048 2049 switch (ctxt->modrm) { 2050 case 0xd8: /* VMRUN */ 2051 return SVM_INSTR_VMRUN; 2052 case 0xda: /* VMLOAD */ 2053 return SVM_INSTR_VMLOAD; 2054 case 0xdb: /* VMSAVE */ 2055 return SVM_INSTR_VMSAVE; 2056 default: 2057 break; 2058 } 2059 2060 
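	/*
	 * For illustration (encodings per the APM, not taken from this
	 * file): the byte sequence 0f 01 d8 decodes with opcode_len == 2,
	 * b == 0x01 and modrm == 0xd8 and is reported as SVM_INSTR_VMRUN
	 * above; any other ModRM value falls through to NONE_SVM_INSTR.
	 */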
return NONE_SVM_INSTR; 2061 } 2062 2063 static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) 2064 { 2065 const int guest_mode_exit_codes[] = { 2066 [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN, 2067 [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD, 2068 [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE, 2069 }; 2070 int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = { 2071 [SVM_INSTR_VMRUN] = vmrun_interception, 2072 [SVM_INSTR_VMLOAD] = vmload_interception, 2073 [SVM_INSTR_VMSAVE] = vmsave_interception, 2074 }; 2075 struct vcpu_svm *svm = to_svm(vcpu); 2076 int ret; 2077 2078 if (is_guest_mode(vcpu)) { 2079 /* Returns '1' or -errno on failure, '0' on success. */ 2080 ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]); 2081 if (ret) 2082 return ret; 2083 return 1; 2084 } 2085 return svm_instr_handlers[opcode](vcpu); 2086 } 2087 2088 /* 2089 * #GP handling code. Note that #GP can be triggered under the following two 2090 * cases: 2091 * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on 2092 * some AMD CPUs when EAX of these instructions are in the reserved memory 2093 * regions (e.g. SMM memory on host). 2094 * 2) VMware backdoor 2095 */ 2096 static int gp_interception(struct kvm_vcpu *vcpu) 2097 { 2098 struct vcpu_svm *svm = to_svm(vcpu); 2099 u32 error_code = svm->vmcb->control.exit_info_1; 2100 int opcode; 2101 2102 /* Both #GP cases have zero error_code */ 2103 if (error_code) 2104 goto reinject; 2105 2106 /* Decode the instruction for usage later */ 2107 if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) 2108 goto reinject; 2109 2110 opcode = svm_instr_opcode(vcpu); 2111 2112 if (opcode == NONE_SVM_INSTR) { 2113 if (!enable_vmware_backdoor) 2114 goto reinject; 2115 2116 /* 2117 * VMware backdoor emulation on #GP interception only handles 2118 * IN{S}, OUT{S}, and RDPMC. 2119 */ 2120 if (!is_guest_mode(vcpu)) 2121 return kvm_emulate_instruction(vcpu, 2122 EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); 2123 } else { 2124 /* All SVM instructions expect page aligned RAX */ 2125 if (svm->vmcb->save.rax & ~PAGE_MASK) 2126 goto reinject; 2127 2128 return emulate_svm_instr(vcpu, opcode); 2129 } 2130 2131 reinject: 2132 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); 2133 return 1; 2134 } 2135 2136 void svm_set_gif(struct vcpu_svm *svm, bool value) 2137 { 2138 if (value) { 2139 /* 2140 * If VGIF is enabled, the STGI intercept is only added to 2141 * detect the opening of the SMI/NMI window; remove it now. 2142 * Likewise, clear the VINTR intercept, we will set it 2143 * again while processing KVM_REQ_EVENT if needed. 2144 */ 2145 if (vgif_enabled(svm)) 2146 svm_clr_intercept(svm, INTERCEPT_STGI); 2147 if (svm_is_intercept(svm, INTERCEPT_VINTR)) 2148 svm_clear_vintr(svm); 2149 2150 enable_gif(svm); 2151 if (svm->vcpu.arch.smi_pending || 2152 svm->vcpu.arch.nmi_pending || 2153 kvm_cpu_has_injectable_intr(&svm->vcpu)) 2154 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); 2155 } else { 2156 disable_gif(svm); 2157 2158 /* 2159 * After a CLGI no interrupts should come. But if vGIF is 2160 * in use, we still rely on the VINTR intercept (rather than 2161 * STGI) to detect an open interrupt window. 
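 *
 * With GIF emulated by KVM (the !vgif case), the gif_set() checks
 * in svm_nmi_blocked() and svm_interrupt_blocked() are what keep
 * events from being injected until the guest executes STGI,
 * mirroring how hardware holds events pending while GIF is clear.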
2162 */ 2163 if (!vgif_enabled(svm)) 2164 svm_clear_vintr(svm); 2165 } 2166 } 2167 2168 static int stgi_interception(struct kvm_vcpu *vcpu) 2169 { 2170 int ret; 2171 2172 if (nested_svm_check_permissions(vcpu)) 2173 return 1; 2174 2175 ret = kvm_skip_emulated_instruction(vcpu); 2176 svm_set_gif(to_svm(vcpu), true); 2177 return ret; 2178 } 2179 2180 static int clgi_interception(struct kvm_vcpu *vcpu) 2181 { 2182 int ret; 2183 2184 if (nested_svm_check_permissions(vcpu)) 2185 return 1; 2186 2187 ret = kvm_skip_emulated_instruction(vcpu); 2188 svm_set_gif(to_svm(vcpu), false); 2189 return ret; 2190 } 2191 2192 static int invlpga_interception(struct kvm_vcpu *vcpu) 2193 { 2194 gva_t gva = kvm_rax_read(vcpu); 2195 u32 asid = kvm_rcx_read(vcpu); 2196 2197 /* FIXME: Handle an address size prefix. */ 2198 if (!is_long_mode(vcpu)) 2199 gva = (u32)gva; 2200 2201 trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva); 2202 2203 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 2204 kvm_mmu_invlpg(vcpu, gva); 2205 2206 return kvm_skip_emulated_instruction(vcpu); 2207 } 2208 2209 static int skinit_interception(struct kvm_vcpu *vcpu) 2210 { 2211 trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu)); 2212 2213 kvm_queue_exception(vcpu, UD_VECTOR); 2214 return 1; 2215 } 2216 2217 static int task_switch_interception(struct kvm_vcpu *vcpu) 2218 { 2219 struct vcpu_svm *svm = to_svm(vcpu); 2220 u16 tss_selector; 2221 int reason; 2222 int int_type = svm->vmcb->control.exit_int_info & 2223 SVM_EXITINTINFO_TYPE_MASK; 2224 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; 2225 uint32_t type = 2226 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; 2227 uint32_t idt_v = 2228 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; 2229 bool has_error_code = false; 2230 u32 error_code = 0; 2231 2232 tss_selector = (u16)svm->vmcb->control.exit_info_1; 2233 2234 if (svm->vmcb->control.exit_info_2 & 2235 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) 2236 reason = TASK_SWITCH_IRET; 2237 else if (svm->vmcb->control.exit_info_2 & 2238 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) 2239 reason = TASK_SWITCH_JMP; 2240 else if (idt_v) 2241 reason = TASK_SWITCH_GATE; 2242 else 2243 reason = TASK_SWITCH_CALL; 2244 2245 if (reason == TASK_SWITCH_GATE) { 2246 switch (type) { 2247 case SVM_EXITINTINFO_TYPE_NMI: 2248 vcpu->arch.nmi_injected = false; 2249 break; 2250 case SVM_EXITINTINFO_TYPE_EXEPT: 2251 if (svm->vmcb->control.exit_info_2 & 2252 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { 2253 has_error_code = true; 2254 error_code = 2255 (u32)svm->vmcb->control.exit_info_2; 2256 } 2257 kvm_clear_exception_queue(vcpu); 2258 break; 2259 case SVM_EXITINTINFO_TYPE_INTR: 2260 kvm_clear_interrupt_queue(vcpu); 2261 break; 2262 default: 2263 break; 2264 } 2265 } 2266 2267 if (reason != TASK_SWITCH_GATE || 2268 int_type == SVM_EXITINTINFO_TYPE_SOFT || 2269 (int_type == SVM_EXITINTINFO_TYPE_EXEPT && 2270 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { 2271 if (!svm_skip_emulated_instruction(vcpu)) 2272 return 0; 2273 } 2274 2275 if (int_type != SVM_EXITINTINFO_TYPE_SOFT) 2276 int_vec = -1; 2277 2278 return kvm_task_switch(vcpu, tss_selector, int_vec, reason, 2279 has_error_code, error_code); 2280 } 2281 2282 static int iret_interception(struct kvm_vcpu *vcpu) 2283 { 2284 struct vcpu_svm *svm = to_svm(vcpu); 2285 2286 ++vcpu->stat.nmi_window_exits; 2287 vcpu->arch.hflags |= HF_IRET_MASK; 2288 if (!sev_es_guest(vcpu->kvm)) { 2289 svm_clr_intercept(svm, INTERCEPT_IRET); 2290 
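		/*
		 * Record where the IRET was seen; svm_complete_interrupts()
		 * compares the current RIP against this value to tell
		 * whether the guest actually made it past the IRET before
		 * NMI blocking is lifted.
		 */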
svm->nmi_iret_rip = kvm_rip_read(vcpu); 2291 } 2292 kvm_make_request(KVM_REQ_EVENT, vcpu); 2293 return 1; 2294 } 2295 2296 static int invlpg_interception(struct kvm_vcpu *vcpu) 2297 { 2298 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 2299 return kvm_emulate_instruction(vcpu, 0); 2300 2301 kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1); 2302 return kvm_skip_emulated_instruction(vcpu); 2303 } 2304 2305 static int emulate_on_interception(struct kvm_vcpu *vcpu) 2306 { 2307 return kvm_emulate_instruction(vcpu, 0); 2308 } 2309 2310 static int rsm_interception(struct kvm_vcpu *vcpu) 2311 { 2312 return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2); 2313 } 2314 2315 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, 2316 unsigned long val) 2317 { 2318 struct vcpu_svm *svm = to_svm(vcpu); 2319 unsigned long cr0 = vcpu->arch.cr0; 2320 bool ret = false; 2321 2322 if (!is_guest_mode(vcpu) || 2323 (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) 2324 return false; 2325 2326 cr0 &= ~SVM_CR0_SELECTIVE_MASK; 2327 val &= ~SVM_CR0_SELECTIVE_MASK; 2328 2329 if (cr0 ^ val) { 2330 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; 2331 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); 2332 } 2333 2334 return ret; 2335 } 2336 2337 #define CR_VALID (1ULL << 63) 2338 2339 static int cr_interception(struct kvm_vcpu *vcpu) 2340 { 2341 struct vcpu_svm *svm = to_svm(vcpu); 2342 int reg, cr; 2343 unsigned long val; 2344 int err; 2345 2346 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 2347 return emulate_on_interception(vcpu); 2348 2349 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) 2350 return emulate_on_interception(vcpu); 2351 2352 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2353 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE) 2354 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0; 2355 else 2356 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; 2357 2358 err = 0; 2359 if (cr >= 16) { /* mov to cr */ 2360 cr -= 16; 2361 val = kvm_register_read(vcpu, reg); 2362 trace_kvm_cr_write(cr, val); 2363 switch (cr) { 2364 case 0: 2365 if (!check_selective_cr0_intercepted(vcpu, val)) 2366 err = kvm_set_cr0(vcpu, val); 2367 else 2368 return 1; 2369 2370 break; 2371 case 3: 2372 err = kvm_set_cr3(vcpu, val); 2373 break; 2374 case 4: 2375 err = kvm_set_cr4(vcpu, val); 2376 break; 2377 case 8: 2378 err = kvm_set_cr8(vcpu, val); 2379 break; 2380 default: 2381 WARN(1, "unhandled write to CR%d", cr); 2382 kvm_queue_exception(vcpu, UD_VECTOR); 2383 return 1; 2384 } 2385 } else { /* mov from cr */ 2386 switch (cr) { 2387 case 0: 2388 val = kvm_read_cr0(vcpu); 2389 break; 2390 case 2: 2391 val = vcpu->arch.cr2; 2392 break; 2393 case 3: 2394 val = kvm_read_cr3(vcpu); 2395 break; 2396 case 4: 2397 val = kvm_read_cr4(vcpu); 2398 break; 2399 case 8: 2400 val = kvm_get_cr8(vcpu); 2401 break; 2402 default: 2403 WARN(1, "unhandled read from CR%d", cr); 2404 kvm_queue_exception(vcpu, UD_VECTOR); 2405 return 1; 2406 } 2407 kvm_register_write(vcpu, reg, val); 2408 trace_kvm_cr_read(cr, val); 2409 } 2410 return kvm_complete_insn_gp(vcpu, err); 2411 } 2412 2413 static int cr_trap(struct kvm_vcpu *vcpu) 2414 { 2415 struct vcpu_svm *svm = to_svm(vcpu); 2416 unsigned long old_value, new_value; 2417 unsigned int cr; 2418 int ret = 0; 2419 2420 new_value = (unsigned long)svm->vmcb->control.exit_info_1; 2421 2422 cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP; 2423 switch (cr) { 2424 case 0: 2425 old_value = 
kvm_read_cr0(vcpu); 2426 svm_set_cr0(vcpu, new_value); 2427 2428 kvm_post_set_cr0(vcpu, old_value, new_value); 2429 break; 2430 case 4: 2431 old_value = kvm_read_cr4(vcpu); 2432 svm_set_cr4(vcpu, new_value); 2433 2434 kvm_post_set_cr4(vcpu, old_value, new_value); 2435 break; 2436 case 8: 2437 ret = kvm_set_cr8(vcpu, new_value); 2438 break; 2439 default: 2440 WARN(1, "unhandled CR%d write trap", cr); 2441 kvm_queue_exception(vcpu, UD_VECTOR); 2442 return 1; 2443 } 2444 2445 return kvm_complete_insn_gp(vcpu, ret); 2446 } 2447 2448 static int dr_interception(struct kvm_vcpu *vcpu) 2449 { 2450 struct vcpu_svm *svm = to_svm(vcpu); 2451 int reg, dr; 2452 unsigned long val; 2453 int err = 0; 2454 2455 if (vcpu->guest_debug == 0) { 2456 /* 2457 * No more DR vmexits; force a reload of the debug registers 2458 * and reenter on this instruction. The next vmexit will 2459 * retrieve the full state of the debug registers. 2460 */ 2461 clr_dr_intercepts(svm); 2462 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 2463 return 1; 2464 } 2465 2466 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) 2467 return emulate_on_interception(vcpu); 2468 2469 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2470 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; 2471 if (dr >= 16) { /* mov to DRn */ 2472 dr -= 16; 2473 val = kvm_register_read(vcpu, reg); 2474 err = kvm_set_dr(vcpu, dr, val); 2475 } else { 2476 kvm_get_dr(vcpu, dr, &val); 2477 kvm_register_write(vcpu, reg, val); 2478 } 2479 2480 return kvm_complete_insn_gp(vcpu, err); 2481 } 2482 2483 static int cr8_write_interception(struct kvm_vcpu *vcpu) 2484 { 2485 int r; 2486 2487 u8 cr8_prev = kvm_get_cr8(vcpu); 2488 /* instruction emulation calls kvm_set_cr8() */ 2489 r = cr_interception(vcpu); 2490 if (lapic_in_kernel(vcpu)) 2491 return r; 2492 if (cr8_prev <= kvm_get_cr8(vcpu)) 2493 return r; 2494 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 2495 return 0; 2496 } 2497 2498 static int efer_trap(struct kvm_vcpu *vcpu) 2499 { 2500 struct msr_data msr_info; 2501 int ret; 2502 2503 /* 2504 * Clear the EFER_SVME bit from EFER. The SVM code always sets this 2505 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against 2506 * whether the guest has X86_FEATURE_SVM - this avoids a failure if 2507 * the guest doesn't have X86_FEATURE_SVM. 
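 *
 * For this trap-style intercept the value being written arrives
 * in exit_info_1 and is simply forwarded to kvm_set_msr_common()
 * below, with EFER_SVME masked off as described above.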
2508 */ 2509 msr_info.host_initiated = false; 2510 msr_info.index = MSR_EFER; 2511 msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME; 2512 ret = kvm_set_msr_common(vcpu, &msr_info); 2513 2514 return kvm_complete_insn_gp(vcpu, ret); 2515 } 2516 2517 static int svm_get_msr_feature(struct kvm_msr_entry *msr) 2518 { 2519 msr->data = 0; 2520 2521 switch (msr->index) { 2522 case MSR_F10H_DECFG: 2523 if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) 2524 msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; 2525 break; 2526 case MSR_IA32_PERF_CAPABILITIES: 2527 return 0; 2528 default: 2529 return KVM_MSR_RET_INVALID; 2530 } 2531 2532 return 0; 2533 } 2534 2535 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2536 { 2537 struct vcpu_svm *svm = to_svm(vcpu); 2538 2539 switch (msr_info->index) { 2540 case MSR_AMD64_TSC_RATIO: 2541 if (!msr_info->host_initiated && !svm->tsc_scaling_enabled) 2542 return 1; 2543 msr_info->data = svm->tsc_ratio_msr; 2544 break; 2545 case MSR_STAR: 2546 msr_info->data = svm->vmcb01.ptr->save.star; 2547 break; 2548 #ifdef CONFIG_X86_64 2549 case MSR_LSTAR: 2550 msr_info->data = svm->vmcb01.ptr->save.lstar; 2551 break; 2552 case MSR_CSTAR: 2553 msr_info->data = svm->vmcb01.ptr->save.cstar; 2554 break; 2555 case MSR_KERNEL_GS_BASE: 2556 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; 2557 break; 2558 case MSR_SYSCALL_MASK: 2559 msr_info->data = svm->vmcb01.ptr->save.sfmask; 2560 break; 2561 #endif 2562 case MSR_IA32_SYSENTER_CS: 2563 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs; 2564 break; 2565 case MSR_IA32_SYSENTER_EIP: 2566 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip; 2567 if (guest_cpuid_is_intel(vcpu)) 2568 msr_info->data |= (u64)svm->sysenter_eip_hi << 32; 2569 break; 2570 case MSR_IA32_SYSENTER_ESP: 2571 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp; 2572 if (guest_cpuid_is_intel(vcpu)) 2573 msr_info->data |= (u64)svm->sysenter_esp_hi << 32; 2574 break; 2575 case MSR_TSC_AUX: 2576 msr_info->data = svm->tsc_aux; 2577 break; 2578 /* 2579 * Nobody will change the following 5 values in the VMCB so we can 2580 * safely return them on rdmsr. They will always be 0 until LBRV is 2581 * implemented. 
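 *
 * (When lbrv is enabled and the guest sets DEBUGCTLMSR.LBR,
 * svm_enable_lbrv() -- see the MSR_IA32_DEBUGCTLMSR case in
 * svm_set_msr() -- turns on LBR virtualization and the hardware
 * then keeps these VMCB fields up to date for us.)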
2582 */ 2583 case MSR_IA32_DEBUGCTLMSR: 2584 msr_info->data = svm->vmcb->save.dbgctl; 2585 break; 2586 case MSR_IA32_LASTBRANCHFROMIP: 2587 msr_info->data = svm->vmcb->save.br_from; 2588 break; 2589 case MSR_IA32_LASTBRANCHTOIP: 2590 msr_info->data = svm->vmcb->save.br_to; 2591 break; 2592 case MSR_IA32_LASTINTFROMIP: 2593 msr_info->data = svm->vmcb->save.last_excp_from; 2594 break; 2595 case MSR_IA32_LASTINTTOIP: 2596 msr_info->data = svm->vmcb->save.last_excp_to; 2597 break; 2598 case MSR_VM_HSAVE_PA: 2599 msr_info->data = svm->nested.hsave_msr; 2600 break; 2601 case MSR_VM_CR: 2602 msr_info->data = svm->nested.vm_cr_msr; 2603 break; 2604 case MSR_IA32_SPEC_CTRL: 2605 if (!msr_info->host_initiated && 2606 !guest_has_spec_ctrl_msr(vcpu)) 2607 return 1; 2608 2609 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 2610 msr_info->data = svm->vmcb->save.spec_ctrl; 2611 else 2612 msr_info->data = svm->spec_ctrl; 2613 break; 2614 case MSR_AMD64_VIRT_SPEC_CTRL: 2615 if (!msr_info->host_initiated && 2616 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) 2617 return 1; 2618 2619 msr_info->data = svm->virt_spec_ctrl; 2620 break; 2621 case MSR_F15H_IC_CFG: { 2622 2623 int family, model; 2624 2625 family = guest_cpuid_family(vcpu); 2626 model = guest_cpuid_model(vcpu); 2627 2628 if (family < 0 || model < 0) 2629 return kvm_get_msr_common(vcpu, msr_info); 2630 2631 msr_info->data = 0; 2632 2633 if (family == 0x15 && 2634 (model >= 0x2 && model < 0x20)) 2635 msr_info->data = 0x1E; 2636 } 2637 break; 2638 case MSR_F10H_DECFG: 2639 msr_info->data = svm->msr_decfg; 2640 break; 2641 default: 2642 return kvm_get_msr_common(vcpu, msr_info); 2643 } 2644 return 0; 2645 } 2646 2647 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) 2648 { 2649 struct vcpu_svm *svm = to_svm(vcpu); 2650 if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) 2651 return kvm_complete_insn_gp(vcpu, err); 2652 2653 ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); 2654 ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 2655 X86_TRAP_GP | 2656 SVM_EVTINJ_TYPE_EXEPT | 2657 SVM_EVTINJ_VALID); 2658 return 1; 2659 } 2660 2661 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) 2662 { 2663 struct vcpu_svm *svm = to_svm(vcpu); 2664 int svm_dis, chg_mask; 2665 2666 if (data & ~SVM_VM_CR_VALID_MASK) 2667 return 1; 2668 2669 chg_mask = SVM_VM_CR_VALID_MASK; 2670 2671 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) 2672 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); 2673 2674 svm->nested.vm_cr_msr &= ~chg_mask; 2675 svm->nested.vm_cr_msr |= (data & chg_mask); 2676 2677 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; 2678 2679 /* check for svm_disable while efer.svme is set */ 2680 if (svm_dis && (vcpu->arch.efer & EFER_SVME)) 2681 return 1; 2682 2683 return 0; 2684 } 2685 2686 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2687 { 2688 struct vcpu_svm *svm = to_svm(vcpu); 2689 int r; 2690 2691 u32 ecx = msr->index; 2692 u64 data = msr->data; 2693 switch (ecx) { 2694 case MSR_AMD64_TSC_RATIO: 2695 2696 if (!svm->tsc_scaling_enabled) { 2697 2698 if (!msr->host_initiated) 2699 return 1; 2700 /* 2701 * In case TSC scaling is not enabled, always 2702 * leave this MSR at the default value. 2703 * 2704 * Due to bug in qemu 6.2.0, it would try to set 2705 * this msr to 0 if tsc scaling is not enabled. 2706 * Ignore this value as well. 
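 *
 * For reference, the ratio is a fixed-point value with an 8-bit
 * integer part (bits 39:32) and a 32-bit fractional part, so the
 * default of 1.0 (no scaling) is 1ULL << 32 and e.g. 1ULL << 31
 * runs the guest at half the host TSC frequency; bits 63:40 are
 * reserved, which is what the SVM_TSC_RATIO_RSVD check below
 * enforces.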
2707 */ 2708 if (data != 0 && data != svm->tsc_ratio_msr) 2709 return 1; 2710 break; 2711 } 2712 2713 if (data & SVM_TSC_RATIO_RSVD) 2714 return 1; 2715 2716 svm->tsc_ratio_msr = data; 2717 2718 if (svm->tsc_scaling_enabled && is_guest_mode(vcpu)) 2719 nested_svm_update_tsc_ratio_msr(vcpu); 2720 2721 break; 2722 case MSR_IA32_CR_PAT: 2723 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) 2724 return 1; 2725 vcpu->arch.pat = data; 2726 svm->vmcb01.ptr->save.g_pat = data; 2727 if (is_guest_mode(vcpu)) 2728 nested_vmcb02_compute_g_pat(svm); 2729 vmcb_mark_dirty(svm->vmcb, VMCB_NPT); 2730 break; 2731 case MSR_IA32_SPEC_CTRL: 2732 if (!msr->host_initiated && 2733 !guest_has_spec_ctrl_msr(vcpu)) 2734 return 1; 2735 2736 if (kvm_spec_ctrl_test_value(data)) 2737 return 1; 2738 2739 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 2740 svm->vmcb->save.spec_ctrl = data; 2741 else 2742 svm->spec_ctrl = data; 2743 if (!data) 2744 break; 2745 2746 /* 2747 * For non-nested: 2748 * When it's written (to non-zero) for the first time, pass 2749 * it through. 2750 * 2751 * For nested: 2752 * The handling of the MSR bitmap for L2 guests is done in 2753 * nested_svm_vmrun_msrpm. 2754 * We update the L1 MSR bit as well since it will end up 2755 * touching the MSR anyway now. 2756 */ 2757 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); 2758 break; 2759 case MSR_IA32_PRED_CMD: 2760 if (!msr->host_initiated && 2761 !guest_has_pred_cmd_msr(vcpu)) 2762 return 1; 2763 2764 if (data & ~PRED_CMD_IBPB) 2765 return 1; 2766 if (!boot_cpu_has(X86_FEATURE_IBPB)) 2767 return 1; 2768 if (!data) 2769 break; 2770 2771 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); 2772 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); 2773 break; 2774 case MSR_AMD64_VIRT_SPEC_CTRL: 2775 if (!msr->host_initiated && 2776 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) 2777 return 1; 2778 2779 if (data & ~SPEC_CTRL_SSBD) 2780 return 1; 2781 2782 svm->virt_spec_ctrl = data; 2783 break; 2784 case MSR_STAR: 2785 svm->vmcb01.ptr->save.star = data; 2786 break; 2787 #ifdef CONFIG_X86_64 2788 case MSR_LSTAR: 2789 svm->vmcb01.ptr->save.lstar = data; 2790 break; 2791 case MSR_CSTAR: 2792 svm->vmcb01.ptr->save.cstar = data; 2793 break; 2794 case MSR_KERNEL_GS_BASE: 2795 svm->vmcb01.ptr->save.kernel_gs_base = data; 2796 break; 2797 case MSR_SYSCALL_MASK: 2798 svm->vmcb01.ptr->save.sfmask = data; 2799 break; 2800 #endif 2801 case MSR_IA32_SYSENTER_CS: 2802 svm->vmcb01.ptr->save.sysenter_cs = data; 2803 break; 2804 case MSR_IA32_SYSENTER_EIP: 2805 svm->vmcb01.ptr->save.sysenter_eip = (u32)data; 2806 /* 2807 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs 2808 * when we spoof an Intel vendor ID (for cross vendor migration). 2809 * In this case we use this intercept to track the high 2810 * 32 bit part of these msrs to support Intel's 2811 * implementation of SYSENTER/SYSEXIT. 2812 */ 2813 svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; 2814 break; 2815 case MSR_IA32_SYSENTER_ESP: 2816 svm->vmcb01.ptr->save.sysenter_esp = (u32)data; 2817 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; 2818 break; 2819 case MSR_TSC_AUX: 2820 /* 2821 * TSC_AUX is usually changed only during boot and never read 2822 * directly. Intercept TSC_AUX instead of exposing it to the 2823 * guest via direct_access_msrs, and switch it via user return. 
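 *
 * The user-return machinery defers restoring the host value until
 * the CPU goes back to userspace, which is far cheaper than
 * swapping the MSR on every VMRUN/#VMEXIT for an MSR the guest
 * rarely touches.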
2824 */ 2825 preempt_disable(); 2826 r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); 2827 preempt_enable(); 2828 if (r) 2829 return 1; 2830 2831 svm->tsc_aux = data; 2832 break; 2833 case MSR_IA32_DEBUGCTLMSR: 2834 if (!lbrv) { 2835 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 2836 __func__, data); 2837 break; 2838 } 2839 if (data & DEBUGCTL_RESERVED_BITS) 2840 return 1; 2841 2842 svm->vmcb->save.dbgctl = data; 2843 vmcb_mark_dirty(svm->vmcb, VMCB_LBR); 2844 if (data & (1ULL<<0)) 2845 svm_enable_lbrv(vcpu); 2846 else 2847 svm_disable_lbrv(vcpu); 2848 break; 2849 case MSR_VM_HSAVE_PA: 2850 /* 2851 * Old kernels did not validate the value written to 2852 * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid 2853 * value to allow live migrating buggy or malicious guests 2854 * originating from those kernels. 2855 */ 2856 if (!msr->host_initiated && !page_address_valid(vcpu, data)) 2857 return 1; 2858 2859 svm->nested.hsave_msr = data & PAGE_MASK; 2860 break; 2861 case MSR_VM_CR: 2862 return svm_set_vm_cr(vcpu, data); 2863 case MSR_VM_IGNNE: 2864 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 2865 break; 2866 case MSR_F10H_DECFG: { 2867 struct kvm_msr_entry msr_entry; 2868 2869 msr_entry.index = msr->index; 2870 if (svm_get_msr_feature(&msr_entry)) 2871 return 1; 2872 2873 /* Check the supported bits */ 2874 if (data & ~msr_entry.data) 2875 return 1; 2876 2877 /* Don't allow the guest to change a bit, #GP */ 2878 if (!msr->host_initiated && (data ^ msr_entry.data)) 2879 return 1; 2880 2881 svm->msr_decfg = data; 2882 break; 2883 } 2884 default: 2885 return kvm_set_msr_common(vcpu, msr); 2886 } 2887 return 0; 2888 } 2889 2890 static int msr_interception(struct kvm_vcpu *vcpu) 2891 { 2892 if (to_svm(vcpu)->vmcb->control.exit_info_1) 2893 return kvm_emulate_wrmsr(vcpu); 2894 else 2895 return kvm_emulate_rdmsr(vcpu); 2896 } 2897 2898 static int interrupt_window_interception(struct kvm_vcpu *vcpu) 2899 { 2900 kvm_make_request(KVM_REQ_EVENT, vcpu); 2901 svm_clear_vintr(to_svm(vcpu)); 2902 2903 /* 2904 * For AVIC, the only reason to end up here is ExtINTs. 2905 * In this case AVIC was temporarily disabled for 2906 * requesting the IRQ window and we have to re-enable it. 2907 */ 2908 kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); 2909 2910 ++vcpu->stat.irq_window_exits; 2911 return 1; 2912 } 2913 2914 static int pause_interception(struct kvm_vcpu *vcpu) 2915 { 2916 bool in_kernel; 2917 2918 /* 2919 * CPL is not made available for an SEV-ES guest, therefore 2920 * vcpu->arch.preempted_in_kernel can never be true. Just 2921 * set in_kernel to false as well. 2922 */ 2923 in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0; 2924 2925 if (!kvm_pause_in_guest(vcpu->kvm)) 2926 grow_ple_window(vcpu); 2927 2928 kvm_vcpu_on_spin(vcpu, in_kernel); 2929 return kvm_skip_emulated_instruction(vcpu); 2930 } 2931 2932 static int invpcid_interception(struct kvm_vcpu *vcpu) 2933 { 2934 struct vcpu_svm *svm = to_svm(vcpu); 2935 unsigned long type; 2936 gva_t gva; 2937 2938 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { 2939 kvm_queue_exception(vcpu, UD_VECTOR); 2940 return 1; 2941 } 2942 2943 /* 2944 * For an INVPCID intercept: 2945 * EXITINFO1 provides the linear address of the memory operand. 2946 * EXITINFO2 provides the contents of the register operand. 
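 *
 * The register operand holds the INVPCID type (0 = individual
 * address, 1 = single PCID, 2 = all PCIDs including globals,
 * 3 = all PCIDs except globals); the memory operand is the
 * 16-byte descriptor containing the PCID and, for type 0, the
 * address to invalidate. kvm_handle_invpcid() validates both.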
2947 */ 2948 type = svm->vmcb->control.exit_info_2; 2949 gva = svm->vmcb->control.exit_info_1; 2950 2951 return kvm_handle_invpcid(vcpu, type, gva); 2952 } 2953 2954 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { 2955 [SVM_EXIT_READ_CR0] = cr_interception, 2956 [SVM_EXIT_READ_CR3] = cr_interception, 2957 [SVM_EXIT_READ_CR4] = cr_interception, 2958 [SVM_EXIT_READ_CR8] = cr_interception, 2959 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception, 2960 [SVM_EXIT_WRITE_CR0] = cr_interception, 2961 [SVM_EXIT_WRITE_CR3] = cr_interception, 2962 [SVM_EXIT_WRITE_CR4] = cr_interception, 2963 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 2964 [SVM_EXIT_READ_DR0] = dr_interception, 2965 [SVM_EXIT_READ_DR1] = dr_interception, 2966 [SVM_EXIT_READ_DR2] = dr_interception, 2967 [SVM_EXIT_READ_DR3] = dr_interception, 2968 [SVM_EXIT_READ_DR4] = dr_interception, 2969 [SVM_EXIT_READ_DR5] = dr_interception, 2970 [SVM_EXIT_READ_DR6] = dr_interception, 2971 [SVM_EXIT_READ_DR7] = dr_interception, 2972 [SVM_EXIT_WRITE_DR0] = dr_interception, 2973 [SVM_EXIT_WRITE_DR1] = dr_interception, 2974 [SVM_EXIT_WRITE_DR2] = dr_interception, 2975 [SVM_EXIT_WRITE_DR3] = dr_interception, 2976 [SVM_EXIT_WRITE_DR4] = dr_interception, 2977 [SVM_EXIT_WRITE_DR5] = dr_interception, 2978 [SVM_EXIT_WRITE_DR6] = dr_interception, 2979 [SVM_EXIT_WRITE_DR7] = dr_interception, 2980 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 2981 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 2982 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 2983 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 2984 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 2985 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, 2986 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, 2987 [SVM_EXIT_INTR] = intr_interception, 2988 [SVM_EXIT_NMI] = nmi_interception, 2989 [SVM_EXIT_SMI] = smi_interception, 2990 [SVM_EXIT_VINTR] = interrupt_window_interception, 2991 [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc, 2992 [SVM_EXIT_CPUID] = kvm_emulate_cpuid, 2993 [SVM_EXIT_IRET] = iret_interception, 2994 [SVM_EXIT_INVD] = kvm_emulate_invd, 2995 [SVM_EXIT_PAUSE] = pause_interception, 2996 [SVM_EXIT_HLT] = kvm_emulate_halt, 2997 [SVM_EXIT_INVLPG] = invlpg_interception, 2998 [SVM_EXIT_INVLPGA] = invlpga_interception, 2999 [SVM_EXIT_IOIO] = io_interception, 3000 [SVM_EXIT_MSR] = msr_interception, 3001 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 3002 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 3003 [SVM_EXIT_VMRUN] = vmrun_interception, 3004 [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall, 3005 [SVM_EXIT_VMLOAD] = vmload_interception, 3006 [SVM_EXIT_VMSAVE] = vmsave_interception, 3007 [SVM_EXIT_STGI] = stgi_interception, 3008 [SVM_EXIT_CLGI] = clgi_interception, 3009 [SVM_EXIT_SKINIT] = skinit_interception, 3010 [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op, 3011 [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, 3012 [SVM_EXIT_MONITOR] = kvm_emulate_monitor, 3013 [SVM_EXIT_MWAIT] = kvm_emulate_mwait, 3014 [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv, 3015 [SVM_EXIT_RDPRU] = kvm_handle_invalid_op, 3016 [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap, 3017 [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap, 3018 [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, 3019 [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, 3020 [SVM_EXIT_INVPCID] = invpcid_interception, 3021 [SVM_EXIT_NPF] = npf_interception, 3022 [SVM_EXIT_RSM] = rsm_interception, 3023 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, 3024 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, 3025 [SVM_EXIT_VMGEXIT] = 
sev_handle_vmgexit, 3026 }; 3027 3028 static void dump_vmcb(struct kvm_vcpu *vcpu) 3029 { 3030 struct vcpu_svm *svm = to_svm(vcpu); 3031 struct vmcb_control_area *control = &svm->vmcb->control; 3032 struct vmcb_save_area *save = &svm->vmcb->save; 3033 struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; 3034 3035 if (!dump_invalid_vmcb) { 3036 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); 3037 return; 3038 } 3039 3040 pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", 3041 svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); 3042 pr_err("VMCB Control Area:\n"); 3043 pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); 3044 pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); 3045 pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff); 3046 pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16); 3047 pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]); 3048 pr_err("%-20s%08x %08x\n", "intercepts:", 3049 control->intercepts[INTERCEPT_WORD3], 3050 control->intercepts[INTERCEPT_WORD4]); 3051 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); 3052 pr_err("%-20s%d\n", "pause filter threshold:", 3053 control->pause_filter_thresh); 3054 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); 3055 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); 3056 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); 3057 pr_err("%-20s%d\n", "asid:", control->asid); 3058 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); 3059 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); 3060 pr_err("%-20s%08x\n", "int_vector:", control->int_vector); 3061 pr_err("%-20s%08x\n", "int_state:", control->int_state); 3062 pr_err("%-20s%08x\n", "exit_code:", control->exit_code); 3063 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); 3064 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); 3065 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); 3066 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); 3067 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); 3068 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); 3069 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); 3070 pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa); 3071 pr_err("%-20s%08x\n", "event_inj:", control->event_inj); 3072 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); 3073 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); 3074 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); 3075 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); 3076 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); 3077 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); 3078 pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); 3079 pr_err("VMCB State Save Area:\n"); 3080 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3081 "es:", 3082 save->es.selector, save->es.attrib, 3083 save->es.limit, save->es.base); 3084 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3085 "cs:", 3086 save->cs.selector, save->cs.attrib, 3087 save->cs.limit, save->cs.base); 3088 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3089 "ss:", 3090 save->ss.selector, save->ss.attrib, 3091 save->ss.limit, save->ss.base); 3092 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3093 "ds:", 3094 
save->ds.selector, save->ds.attrib, 3095 save->ds.limit, save->ds.base); 3096 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3097 "fs:", 3098 save01->fs.selector, save01->fs.attrib, 3099 save01->fs.limit, save01->fs.base); 3100 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3101 "gs:", 3102 save01->gs.selector, save01->gs.attrib, 3103 save01->gs.limit, save01->gs.base); 3104 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3105 "gdtr:", 3106 save->gdtr.selector, save->gdtr.attrib, 3107 save->gdtr.limit, save->gdtr.base); 3108 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3109 "ldtr:", 3110 save01->ldtr.selector, save01->ldtr.attrib, 3111 save01->ldtr.limit, save01->ldtr.base); 3112 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3113 "idtr:", 3114 save->idtr.selector, save->idtr.attrib, 3115 save->idtr.limit, save->idtr.base); 3116 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3117 "tr:", 3118 save01->tr.selector, save01->tr.attrib, 3119 save01->tr.limit, save01->tr.base); 3120 pr_err("cpl: %d efer: %016llx\n", 3121 save->cpl, save->efer); 3122 pr_err("%-15s %016llx %-13s %016llx\n", 3123 "cr0:", save->cr0, "cr2:", save->cr2); 3124 pr_err("%-15s %016llx %-13s %016llx\n", 3125 "cr3:", save->cr3, "cr4:", save->cr4); 3126 pr_err("%-15s %016llx %-13s %016llx\n", 3127 "dr6:", save->dr6, "dr7:", save->dr7); 3128 pr_err("%-15s %016llx %-13s %016llx\n", 3129 "rip:", save->rip, "rflags:", save->rflags); 3130 pr_err("%-15s %016llx %-13s %016llx\n", 3131 "rsp:", save->rsp, "rax:", save->rax); 3132 pr_err("%-15s %016llx %-13s %016llx\n", 3133 "star:", save01->star, "lstar:", save01->lstar); 3134 pr_err("%-15s %016llx %-13s %016llx\n", 3135 "cstar:", save01->cstar, "sfmask:", save01->sfmask); 3136 pr_err("%-15s %016llx %-13s %016llx\n", 3137 "kernel_gs_base:", save01->kernel_gs_base, 3138 "sysenter_cs:", save01->sysenter_cs); 3139 pr_err("%-15s %016llx %-13s %016llx\n", 3140 "sysenter_esp:", save01->sysenter_esp, 3141 "sysenter_eip:", save01->sysenter_eip); 3142 pr_err("%-15s %016llx %-13s %016llx\n", 3143 "gpat:", save->g_pat, "dbgctl:", save->dbgctl); 3144 pr_err("%-15s %016llx %-13s %016llx\n", 3145 "br_from:", save->br_from, "br_to:", save->br_to); 3146 pr_err("%-15s %016llx %-13s %016llx\n", 3147 "excp_from:", save->last_excp_from, 3148 "excp_to:", save->last_excp_to); 3149 } 3150 3151 static bool svm_check_exit_valid(u64 exit_code) 3152 { 3153 return (exit_code < ARRAY_SIZE(svm_exit_handlers) && 3154 svm_exit_handlers[exit_code]); 3155 } 3156 3157 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) 3158 { 3159 vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); 3160 dump_vmcb(vcpu); 3161 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3162 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 3163 vcpu->run->internal.ndata = 2; 3164 vcpu->run->internal.data[0] = exit_code; 3165 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 3166 return 0; 3167 } 3168 3169 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) 3170 { 3171 if (!svm_check_exit_valid(exit_code)) 3172 return svm_handle_invalid_exit(vcpu, exit_code); 3173 3174 #ifdef CONFIG_RETPOLINE 3175 if (exit_code == SVM_EXIT_MSR) 3176 return msr_interception(vcpu); 3177 else if (exit_code == SVM_EXIT_VINTR) 3178 return interrupt_window_interception(vcpu); 3179 else if (exit_code == SVM_EXIT_INTR) 3180 return intr_interception(vcpu); 3181 else if (exit_code == SVM_EXIT_HLT) 3182 return kvm_emulate_halt(vcpu); 3183 else if (exit_code == 
SVM_EXIT_NPF) 3184 return npf_interception(vcpu); 3185 #endif 3186 return svm_exit_handlers[exit_code](vcpu); 3187 } 3188 3189 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 3190 u64 *info1, u64 *info2, 3191 u32 *intr_info, u32 *error_code) 3192 { 3193 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; 3194 3195 *reason = control->exit_code; 3196 *info1 = control->exit_info_1; 3197 *info2 = control->exit_info_2; 3198 *intr_info = control->exit_int_info; 3199 if ((*intr_info & SVM_EXITINTINFO_VALID) && 3200 (*intr_info & SVM_EXITINTINFO_VALID_ERR)) 3201 *error_code = control->exit_int_info_err; 3202 else 3203 *error_code = 0; 3204 } 3205 3206 static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 3207 { 3208 struct vcpu_svm *svm = to_svm(vcpu); 3209 struct kvm_run *kvm_run = vcpu->run; 3210 u32 exit_code = svm->vmcb->control.exit_code; 3211 3212 trace_kvm_exit(vcpu, KVM_ISA_SVM); 3213 3214 /* SEV-ES guests must use the CR write traps to track CR registers. */ 3215 if (!sev_es_guest(vcpu->kvm)) { 3216 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) 3217 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3218 if (npt_enabled) 3219 vcpu->arch.cr3 = svm->vmcb->save.cr3; 3220 } 3221 3222 if (is_guest_mode(vcpu)) { 3223 int vmexit; 3224 3225 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM); 3226 3227 vmexit = nested_svm_exit_special(svm); 3228 3229 if (vmexit == NESTED_EXIT_CONTINUE) 3230 vmexit = nested_svm_exit_handled(svm); 3231 3232 if (vmexit == NESTED_EXIT_DONE) 3233 return 1; 3234 } 3235 3236 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 3237 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3238 kvm_run->fail_entry.hardware_entry_failure_reason 3239 = svm->vmcb->control.exit_code; 3240 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 3241 dump_vmcb(vcpu); 3242 return 0; 3243 } 3244 3245 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 3246 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 3247 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && 3248 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) 3249 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " 3250 "exit_code 0x%x\n", 3251 __func__, svm->vmcb->control.exit_int_info, 3252 exit_code); 3253 3254 if (exit_fastpath != EXIT_FASTPATH_NONE) 3255 return 1; 3256 3257 return svm_invoke_exit_handler(vcpu, exit_code); 3258 } 3259 3260 static void reload_tss(struct kvm_vcpu *vcpu) 3261 { 3262 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); 3263 3264 sd->tss_desc->type = 9; /* available 32/64-bit TSS */ 3265 load_TR_desc(); 3266 } 3267 3268 static void pre_svm_run(struct kvm_vcpu *vcpu) 3269 { 3270 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); 3271 struct vcpu_svm *svm = to_svm(vcpu); 3272 3273 /* 3274 * If the previous vmrun of the vmcb occurred on a different physical 3275 * cpu, then mark the vmcb dirty and assign a new asid. Hardware's 3276 * vmcb clean bits are per logical CPU, as are KVM's asid assignments. 
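 *
 * Zeroing asid_generation below effectively forces the generation
 * check at the end of this function to fail, so new_asid() hands
 * out a fresh ASID rather than reusing a TLB tag that may still
 * be live on this CPU for some other guest.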
3277 */ 3278 if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) { 3279 svm->current_vmcb->asid_generation = 0; 3280 vmcb_mark_all_dirty(svm->vmcb); 3281 svm->current_vmcb->cpu = vcpu->cpu; 3282 } 3283 3284 if (sev_guest(vcpu->kvm)) 3285 return pre_sev_run(svm, vcpu->cpu); 3286 3287 /* FIXME: handle wraparound of asid_generation */ 3288 if (svm->current_vmcb->asid_generation != sd->asid_generation) 3289 new_asid(svm, sd); 3290 } 3291 3292 static void svm_inject_nmi(struct kvm_vcpu *vcpu) 3293 { 3294 struct vcpu_svm *svm = to_svm(vcpu); 3295 3296 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 3297 vcpu->arch.hflags |= HF_NMI_MASK; 3298 if (!sev_es_guest(vcpu->kvm)) 3299 svm_set_intercept(svm, INTERCEPT_IRET); 3300 ++vcpu->stat.nmi_injections; 3301 } 3302 3303 static void svm_inject_irq(struct kvm_vcpu *vcpu) 3304 { 3305 struct vcpu_svm *svm = to_svm(vcpu); 3306 3307 BUG_ON(!(gif_set(svm))); 3308 3309 trace_kvm_inj_virq(vcpu->arch.interrupt.nr); 3310 ++vcpu->stat.irq_injections; 3311 3312 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | 3313 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; 3314 } 3315 3316 void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, 3317 int trig_mode, int vector) 3318 { 3319 /* 3320 * vcpu->arch.apicv_active must be read after vcpu->mode. 3321 * Pairs with smp_store_release in vcpu_enter_guest. 3322 */ 3323 bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); 3324 3325 if (!READ_ONCE(vcpu->arch.apicv_active)) { 3326 /* Process the interrupt via inject_pending_event */ 3327 kvm_make_request(KVM_REQ_EVENT, vcpu); 3328 kvm_vcpu_kick(vcpu); 3329 return; 3330 } 3331 3332 trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); 3333 if (in_guest_mode) { 3334 /* 3335 * Signal the doorbell to tell hardware to inject the IRQ. If 3336 * the vCPU exits the guest before the doorbell chimes, hardware 3337 * will automatically process AVIC interrupts at the next VMRUN. 3338 */ 3339 avic_ring_doorbell(vcpu); 3340 } else { 3341 /* 3342 * Wake the vCPU if it was blocking. KVM will then detect the 3343 * pending IRQ when checking if the vCPU has a wake event. 3344 */ 3345 kvm_vcpu_wake_up(vcpu); 3346 } 3347 } 3348 3349 static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, 3350 int trig_mode, int vector) 3351 { 3352 kvm_lapic_set_irr(vector, apic); 3353 3354 /* 3355 * Pairs with the smp_mb_*() after setting vcpu->guest_mode in 3356 * vcpu_enter_guest() to ensure the write to the vIRR is ordered before 3357 * the read of guest_mode. This guarantees that either VMRUN will see 3358 * and process the new vIRR entry, or that svm_complete_interrupt_delivery 3359 * will signal the doorbell if the CPU has already entered the guest. 3360 */ 3361 smp_mb__after_atomic(); 3362 svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector); 3363 } 3364 3365 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 3366 { 3367 struct vcpu_svm *svm = to_svm(vcpu); 3368 3369 /* 3370 * SEV-ES guests must always keep the CR intercepts cleared. CR 3371 * tracking is done using the CR write traps. 
3372 */ 3373 if (sev_es_guest(vcpu->kvm)) 3374 return; 3375 3376 if (nested_svm_virtualize_tpr(vcpu)) 3377 return; 3378 3379 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); 3380 3381 if (irr == -1) 3382 return; 3383 3384 if (tpr >= irr) 3385 svm_set_intercept(svm, INTERCEPT_CR8_WRITE); 3386 } 3387 3388 bool svm_nmi_blocked(struct kvm_vcpu *vcpu) 3389 { 3390 struct vcpu_svm *svm = to_svm(vcpu); 3391 struct vmcb *vmcb = svm->vmcb; 3392 bool ret; 3393 3394 if (!gif_set(svm)) 3395 return true; 3396 3397 if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) 3398 return false; 3399 3400 ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || 3401 (vcpu->arch.hflags & HF_NMI_MASK); 3402 3403 return ret; 3404 } 3405 3406 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 3407 { 3408 struct vcpu_svm *svm = to_svm(vcpu); 3409 if (svm->nested.nested_run_pending) 3410 return -EBUSY; 3411 3412 if (svm_nmi_blocked(vcpu)) 3413 return 0; 3414 3415 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 3416 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) 3417 return -EBUSY; 3418 return 1; 3419 } 3420 3421 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) 3422 { 3423 return !!(vcpu->arch.hflags & HF_NMI_MASK); 3424 } 3425 3426 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 3427 { 3428 struct vcpu_svm *svm = to_svm(vcpu); 3429 3430 if (masked) { 3431 vcpu->arch.hflags |= HF_NMI_MASK; 3432 if (!sev_es_guest(vcpu->kvm)) 3433 svm_set_intercept(svm, INTERCEPT_IRET); 3434 } else { 3435 vcpu->arch.hflags &= ~HF_NMI_MASK; 3436 if (!sev_es_guest(vcpu->kvm)) 3437 svm_clr_intercept(svm, INTERCEPT_IRET); 3438 } 3439 } 3440 3441 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) 3442 { 3443 struct vcpu_svm *svm = to_svm(vcpu); 3444 struct vmcb *vmcb = svm->vmcb; 3445 3446 if (!gif_set(svm)) 3447 return true; 3448 3449 if (is_guest_mode(vcpu)) { 3450 /* As long as interrupts are being delivered... */ 3451 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) 3452 ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) 3453 : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) 3454 return true; 3455 3456 /* ... vmexits aren't blocked by the interrupt shadow */ 3457 if (nested_exit_on_intr(svm)) 3458 return false; 3459 } else { 3460 if (!svm_get_if_flag(vcpu)) 3461 return true; 3462 } 3463 3464 return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK); 3465 } 3466 3467 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) 3468 { 3469 struct vcpu_svm *svm = to_svm(vcpu); 3470 3471 if (svm->nested.nested_run_pending) 3472 return -EBUSY; 3473 3474 if (svm_interrupt_blocked(vcpu)) 3475 return 0; 3476 3477 /* 3478 * An IRQ must not be injected into L2 if it's supposed to VM-Exit, 3479 * e.g. if the IRQ arrived asynchronously after checking nested events. 3480 */ 3481 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) 3482 return -EBUSY; 3483 3484 return 1; 3485 } 3486 3487 static void svm_enable_irq_window(struct kvm_vcpu *vcpu) 3488 { 3489 struct vcpu_svm *svm = to_svm(vcpu); 3490 3491 /* 3492 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes 3493 * 1, because that's a separate STGI/VMRUN intercept. The next time we 3494 * get that intercept, this function will be called again though and 3495 * we'll get the vintr intercept. However, if the vGIF feature is 3496 * enabled, the STGI interception will not occur. Enable the irq 3497 * window under the assumption that the hardware will set the GIF. 
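 *
 * svm_set_vintr() then programs a dummy virtual interrupt (V_IRQ)
 * together with the VINTR intercept, so KVM gets a VMEXIT as soon
 * as the guest can accept interrupts and the real pending event
 * can be injected from interrupt_window_interception().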
3498 */ 3499 if (vgif_enabled(svm) || gif_set(svm)) { 3500 /* 3501 * IRQ window is not needed when AVIC is enabled, 3502 * unless we have pending ExtINT since it cannot be injected 3503 * via AVIC. In such case, we need to temporarily disable AVIC, 3504 * and fallback to injecting IRQ via V_IRQ. 3505 */ 3506 kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN); 3507 svm_set_vintr(svm); 3508 } 3509 } 3510 3511 static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) 3512 { 3513 struct vcpu_svm *svm = to_svm(vcpu); 3514 3515 if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK) 3516 return; /* IRET will cause a vm exit */ 3517 3518 if (!gif_set(svm)) { 3519 if (vgif_enabled(svm)) 3520 svm_set_intercept(svm, INTERCEPT_STGI); 3521 return; /* STGI will cause a vm exit */ 3522 } 3523 3524 /* 3525 * Something prevents NMI from been injected. Single step over possible 3526 * problem (IRET or exception injection or interrupt shadow) 3527 */ 3528 svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); 3529 svm->nmi_singlestep = true; 3530 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 3531 } 3532 3533 static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) 3534 { 3535 struct vcpu_svm *svm = to_svm(vcpu); 3536 3537 /* 3538 * Flush only the current ASID even if the TLB flush was invoked via 3539 * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all 3540 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and 3541 * unconditionally does a TLB flush on both nested VM-Enter and nested 3542 * VM-Exit (via kvm_mmu_reset_context()). 3543 */ 3544 if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) 3545 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 3546 else 3547 svm->current_vmcb->asid_generation--; 3548 } 3549 3550 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) 3551 { 3552 struct vcpu_svm *svm = to_svm(vcpu); 3553 3554 invlpga(gva, svm->vmcb->control.asid); 3555 } 3556 3557 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) 3558 { 3559 struct vcpu_svm *svm = to_svm(vcpu); 3560 3561 if (nested_svm_virtualize_tpr(vcpu)) 3562 return; 3563 3564 if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) { 3565 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 3566 kvm_set_cr8(vcpu, cr8); 3567 } 3568 } 3569 3570 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) 3571 { 3572 struct vcpu_svm *svm = to_svm(vcpu); 3573 u64 cr8; 3574 3575 if (nested_svm_virtualize_tpr(vcpu) || 3576 kvm_vcpu_apicv_active(vcpu)) 3577 return; 3578 3579 cr8 = kvm_get_cr8(vcpu); 3580 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 3581 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 3582 } 3583 3584 static void svm_complete_interrupts(struct kvm_vcpu *vcpu) 3585 { 3586 struct vcpu_svm *svm = to_svm(vcpu); 3587 u8 vector; 3588 int type; 3589 u32 exitintinfo = svm->vmcb->control.exit_int_info; 3590 unsigned int3_injected = svm->int3_injected; 3591 3592 svm->int3_injected = 0; 3593 3594 /* 3595 * If we've made progress since setting HF_IRET_MASK, we've 3596 * executed an IRET and can allow NMI injection. 
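 *
 * "Progress" is judged by comparing RIP with the value that
 * iret_interception() stashed in nmi_iret_rip; SEV-ES guests do
 * not expose RIP, so for them the IRET intercept itself is taken
 * as proof that the IRET has executed.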
3597 */ 3598 if ((vcpu->arch.hflags & HF_IRET_MASK) && 3599 (sev_es_guest(vcpu->kvm) || 3600 kvm_rip_read(vcpu) != svm->nmi_iret_rip)) { 3601 vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 3602 kvm_make_request(KVM_REQ_EVENT, vcpu); 3603 } 3604 3605 vcpu->arch.nmi_injected = false; 3606 kvm_clear_exception_queue(vcpu); 3607 kvm_clear_interrupt_queue(vcpu); 3608 3609 if (!(exitintinfo & SVM_EXITINTINFO_VALID)) 3610 return; 3611 3612 kvm_make_request(KVM_REQ_EVENT, vcpu); 3613 3614 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; 3615 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; 3616 3617 switch (type) { 3618 case SVM_EXITINTINFO_TYPE_NMI: 3619 vcpu->arch.nmi_injected = true; 3620 break; 3621 case SVM_EXITINTINFO_TYPE_EXEPT: 3622 /* 3623 * Never re-inject a #VC exception. 3624 */ 3625 if (vector == X86_TRAP_VC) 3626 break; 3627 3628 /* 3629 * In case of software exceptions, do not reinject the vector, 3630 * but re-execute the instruction instead. Rewind RIP first 3631 * if we emulated INT3 before. 3632 */ 3633 if (kvm_exception_is_soft(vector)) { 3634 if (vector == BP_VECTOR && int3_injected && 3635 kvm_is_linear_rip(vcpu, svm->int3_rip)) 3636 kvm_rip_write(vcpu, 3637 kvm_rip_read(vcpu) - int3_injected); 3638 break; 3639 } 3640 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { 3641 u32 err = svm->vmcb->control.exit_int_info_err; 3642 kvm_requeue_exception_e(vcpu, vector, err); 3643 3644 } else 3645 kvm_requeue_exception(vcpu, vector); 3646 break; 3647 case SVM_EXITINTINFO_TYPE_INTR: 3648 kvm_queue_interrupt(vcpu, vector, false); 3649 break; 3650 default: 3651 break; 3652 } 3653 } 3654 3655 static void svm_cancel_injection(struct kvm_vcpu *vcpu) 3656 { 3657 struct vcpu_svm *svm = to_svm(vcpu); 3658 struct vmcb_control_area *control = &svm->vmcb->control; 3659 3660 control->exit_int_info = control->event_inj; 3661 control->exit_int_info_err = control->event_inj_err; 3662 control->event_inj = 0; 3663 svm_complete_interrupts(vcpu); 3664 } 3665 3666 static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) 3667 { 3668 return 1; 3669 } 3670 3671 static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) 3672 { 3673 if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && 3674 to_svm(vcpu)->vmcb->control.exit_info_1) 3675 return handle_fastpath_set_msr_irqoff(vcpu); 3676 3677 return EXIT_FASTPATH_NONE; 3678 } 3679 3680 static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) 3681 { 3682 struct vcpu_svm *svm = to_svm(vcpu); 3683 unsigned long vmcb_pa = svm->current_vmcb->pa; 3684 3685 guest_state_enter_irqoff(); 3686 3687 if (sev_es_guest(vcpu->kvm)) { 3688 __svm_sev_es_vcpu_run(vmcb_pa); 3689 } else { 3690 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); 3691 3692 /* 3693 * Use a single vmcb (vmcb01 because it's always valid) for 3694 * context switching guest state via VMLOAD/VMSAVE, that way 3695 * the state doesn't need to be copied between vmcb01 and 3696 * vmcb02 when switching vmcbs for nested virtualization. 
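 *
 * VMLOAD/VMSAVE transfer the state that VMRUN itself does not
 * switch: FS/GS/TR/LDTR (including hidden state), KERNEL_GS_BASE,
 * the STAR/LSTAR/CSTAR/SFMASK syscall MSRs and the SYSENTER MSRs.
 * The vmload() of the per-CPU save_area afterwards brings the
 * host copies of exactly that state back.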
3697 */ 3698 vmload(svm->vmcb01.pa); 3699 __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs); 3700 vmsave(svm->vmcb01.pa); 3701 3702 vmload(__sme_page_pa(sd->save_area)); 3703 } 3704 3705 guest_state_exit_irqoff(); 3706 } 3707 3708 static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) 3709 { 3710 struct vcpu_svm *svm = to_svm(vcpu); 3711 3712 trace_kvm_entry(vcpu); 3713 3714 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 3715 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 3716 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 3717 3718 /* 3719 * Disable singlestep if we're injecting an interrupt/exception. 3720 * We don't want our modified rflags to be pushed on the stack where 3721 * we might not be able to easily reset them if we disabled NMI 3722 * singlestep later. 3723 */ 3724 if (svm->nmi_singlestep && svm->vmcb->control.event_inj) { 3725 /* 3726 * Event injection happens before external interrupts cause a 3727 * vmexit and interrupts are disabled here, so smp_send_reschedule 3728 * is enough to force an immediate vmexit. 3729 */ 3730 disable_nmi_singlestep(svm); 3731 smp_send_reschedule(vcpu->cpu); 3732 } 3733 3734 pre_svm_run(vcpu); 3735 3736 sync_lapic_to_cr8(vcpu); 3737 3738 if (unlikely(svm->asid != svm->vmcb->control.asid)) { 3739 svm->vmcb->control.asid = svm->asid; 3740 vmcb_mark_dirty(svm->vmcb, VMCB_ASID); 3741 } 3742 svm->vmcb->save.cr2 = vcpu->arch.cr2; 3743 3744 svm_hv_update_vp_id(svm->vmcb, vcpu); 3745 3746 /* 3747 * Run with all-zero DR6 unless needed, so that we can get the exact cause 3748 * of a #DB. 3749 */ 3750 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) 3751 svm_set_dr6(svm, vcpu->arch.dr6); 3752 else 3753 svm_set_dr6(svm, DR6_ACTIVE_LOW); 3754 3755 clgi(); 3756 kvm_load_guest_xsave_state(vcpu); 3757 3758 kvm_wait_lapic_expire(vcpu); 3759 3760 /* 3761 * If this vCPU has touched SPEC_CTRL, restore the guest's value if 3762 * it's non-zero. Since vmentry is serialising on affected CPUs, there 3763 * is no need to worry about the conditional branch over the wrmsr 3764 * being speculatively taken. 3765 */ 3766 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 3767 x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); 3768 3769 svm_vcpu_enter_exit(vcpu); 3770 3771 /* 3772 * We do not use IBRS in the kernel. If this vCPU has used the 3773 * SPEC_CTRL MSR it may have left it on; save the value and 3774 * turn it off. This is much more efficient than blindly adding 3775 * it to the atomic save/restore list. Especially as the former 3776 * (Saving guest MSRs on vmexit) doesn't even exist in KVM. 3777 * 3778 * For non-nested case: 3779 * If the L01 MSR bitmap does not intercept the MSR, then we need to 3780 * save it. 3781 * 3782 * For nested case: 3783 * If the L02 MSR bitmap does not intercept the MSR, then we need to 3784 * save it. 
3785 */ 3786 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) && 3787 unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) 3788 svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); 3789 3790 if (!sev_es_guest(vcpu->kvm)) 3791 reload_tss(vcpu); 3792 3793 if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 3794 x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); 3795 3796 if (!sev_es_guest(vcpu->kvm)) { 3797 vcpu->arch.cr2 = svm->vmcb->save.cr2; 3798 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; 3799 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; 3800 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; 3801 } 3802 vcpu->arch.regs_dirty = 0; 3803 3804 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 3805 kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); 3806 3807 kvm_load_host_xsave_state(vcpu); 3808 stgi(); 3809 3810 /* Any pending NMI will happen here */ 3811 3812 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) 3813 kvm_after_interrupt(vcpu); 3814 3815 sync_cr8_to_lapic(vcpu); 3816 3817 svm->next_rip = 0; 3818 if (is_guest_mode(vcpu)) { 3819 nested_sync_control_from_vmcb02(svm); 3820 3821 /* Track VMRUNs that have made past consistency checking */ 3822 if (svm->nested.nested_run_pending && 3823 svm->vmcb->control.exit_code != SVM_EXIT_ERR) 3824 ++vcpu->stat.nested_run; 3825 3826 svm->nested.nested_run_pending = 0; 3827 } 3828 3829 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; 3830 vmcb_mark_all_clean(svm->vmcb); 3831 3832 /* if exit due to PF check for async PF */ 3833 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) 3834 vcpu->arch.apf.host_apf_flags = 3835 kvm_read_and_reset_apf_flags(); 3836 3837 vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; 3838 3839 /* 3840 * We need to handle MC intercepts here before the vcpu has a chance to 3841 * change the physical cpu 3842 */ 3843 if (unlikely(svm->vmcb->control.exit_code == 3844 SVM_EXIT_EXCP_BASE + MC_VECTOR)) 3845 svm_handle_mce(vcpu); 3846 3847 svm_complete_interrupts(vcpu); 3848 3849 if (is_guest_mode(vcpu)) 3850 return EXIT_FASTPATH_NONE; 3851 3852 return svm_exit_handlers_fastpath(vcpu); 3853 } 3854 3855 static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, 3856 int root_level) 3857 { 3858 struct vcpu_svm *svm = to_svm(vcpu); 3859 unsigned long cr3; 3860 3861 if (npt_enabled) { 3862 svm->vmcb->control.nested_cr3 = __sme_set(root_hpa); 3863 vmcb_mark_dirty(svm->vmcb, VMCB_NPT); 3864 3865 hv_track_root_tdp(vcpu, root_hpa); 3866 3867 cr3 = vcpu->arch.cr3; 3868 } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { 3869 cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); 3870 } else { 3871 /* PCID in the guest should be impossible with a 32-bit MMU. */ 3872 WARN_ON_ONCE(kvm_get_active_pcid(vcpu)); 3873 cr3 = root_hpa; 3874 } 3875 3876 svm->vmcb->save.cr3 = cr3; 3877 vmcb_mark_dirty(svm->vmcb, VMCB_CR); 3878 } 3879 3880 static int is_disabled(void) 3881 { 3882 u64 vm_cr; 3883 3884 rdmsrl(MSR_VM_CR, vm_cr); 3885 if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) 3886 return 1; 3887 3888 return 0; 3889 } 3890 3891 static void 3892 svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 3893 { 3894 /* 3895 * Patch in the VMMCALL instruction: 3896 */ 3897 hypercall[0] = 0x0f; 3898 hypercall[1] = 0x01; 3899 hypercall[2] = 0xd9; 3900 } 3901 3902 static int __init svm_check_processor_compat(void) 3903 { 3904 return 0; 3905 } 3906 3907 /* 3908 * The kvm parameter can be NULL (module initialization, or invocation before 3909 * VM creation). 
Be sure to check the kvm parameter before using it. 3910 */ 3911 static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) 3912 { 3913 switch (index) { 3914 case MSR_IA32_MCG_EXT_CTL: 3915 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 3916 return false; 3917 case MSR_IA32_SMBASE: 3918 /* SEV-ES guests do not support SMM, so report false */ 3919 if (kvm && sev_es_guest(kvm)) 3920 return false; 3921 break; 3922 default: 3923 break; 3924 } 3925 3926 return true; 3927 } 3928 3929 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 3930 { 3931 return 0; 3932 } 3933 3934 static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) 3935 { 3936 struct vcpu_svm *svm = to_svm(vcpu); 3937 struct kvm_cpuid_entry2 *best; 3938 struct kvm *kvm = vcpu->kvm; 3939 3940 vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && 3941 boot_cpu_has(X86_FEATURE_XSAVE) && 3942 boot_cpu_has(X86_FEATURE_XSAVES); 3943 3944 /* Update nrips enabled cache */ 3945 svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && 3946 guest_cpuid_has(vcpu, X86_FEATURE_NRIPS); 3947 3948 svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR); 3949 3950 svm_recalc_instruction_intercepts(vcpu, svm); 3951 3952 /* For sev guests, the memory encryption bit is not reserved in CR3. */ 3953 if (sev_guest(vcpu->kvm)) { 3954 best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0); 3955 if (best) 3956 vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f)); 3957 } 3958 3959 if (kvm_vcpu_apicv_active(vcpu)) { 3960 /* 3961 * AVIC does not work with an x2APIC mode guest. If the X2APIC feature 3962 * is exposed to the guest, disable AVIC. 3963 */ 3964 if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) 3965 kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC); 3966 3967 /* 3968 * Currently, AVIC does not work with nested virtualization. 3969 * So, we disable AVIC when cpuid for SVM is set in the L1 guest. 
3970 */ 3971 if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM)) 3972 kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_NESTED); 3973 } 3974 init_vmcb_after_set_cpuid(vcpu); 3975 } 3976 3977 static bool svm_has_wbinvd_exit(void) 3978 { 3979 return true; 3980 } 3981 3982 #define PRE_EX(exit) { .exit_code = (exit), \ 3983 .stage = X86_ICPT_PRE_EXCEPT, } 3984 #define POST_EX(exit) { .exit_code = (exit), \ 3985 .stage = X86_ICPT_POST_EXCEPT, } 3986 #define POST_MEM(exit) { .exit_code = (exit), \ 3987 .stage = X86_ICPT_POST_MEMACCESS, } 3988 3989 static const struct __x86_intercept { 3990 u32 exit_code; 3991 enum x86_intercept_stage stage; 3992 } x86_intercept_map[] = { 3993 [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0), 3994 [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0), 3995 [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0), 3996 [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0), 3997 [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0), 3998 [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0), 3999 [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0), 4000 [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ), 4001 [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ), 4002 [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE), 4003 [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE), 4004 [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ), 4005 [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ), 4006 [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE), 4007 [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE), 4008 [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN), 4009 [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL), 4010 [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD), 4011 [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE), 4012 [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI), 4013 [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI), 4014 [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT), 4015 [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA), 4016 [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP), 4017 [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR), 4018 [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT), 4019 [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG), 4020 [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD), 4021 [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD), 4022 [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR), 4023 [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC), 4024 [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR), 4025 [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC), 4026 [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID), 4027 [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM), 4028 [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE), 4029 [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF), 4030 [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF), 4031 [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT), 4032 [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET), 4033 [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP), 4034 [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT), 4035 [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO), 4036 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), 4037 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), 4038 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), 4039 [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV), 4040 }; 4041 4042 #undef PRE_EX 4043 #undef POST_EX 4044 #undef POST_MEM 4045 4046 static int svm_check_intercept(struct kvm_vcpu *vcpu, 4047 struct x86_instruction_info *info, 4048 enum x86_intercept_stage stage, 4049 struct x86_exception *exception) 4050 { 4051 struct vcpu_svm *svm = to_svm(vcpu); 4052 
int vmexit, ret = X86EMUL_CONTINUE; 4053 struct __x86_intercept icpt_info; 4054 struct vmcb *vmcb = svm->vmcb; 4055 4056 if (info->intercept >= ARRAY_SIZE(x86_intercept_map)) 4057 goto out; 4058 4059 icpt_info = x86_intercept_map[info->intercept]; 4060 4061 if (stage != icpt_info.stage) 4062 goto out; 4063 4064 switch (icpt_info.exit_code) { 4065 case SVM_EXIT_READ_CR0: 4066 if (info->intercept == x86_intercept_cr_read) 4067 icpt_info.exit_code += info->modrm_reg; 4068 break; 4069 case SVM_EXIT_WRITE_CR0: { 4070 unsigned long cr0, val; 4071 4072 if (info->intercept == x86_intercept_cr_write) 4073 icpt_info.exit_code += info->modrm_reg; 4074 4075 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 || 4076 info->intercept == x86_intercept_clts) 4077 break; 4078 4079 if (!(vmcb12_is_intercept(&svm->nested.ctl, 4080 INTERCEPT_SELECTIVE_CR0))) 4081 break; 4082 4083 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; 4084 val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; 4085 4086 if (info->intercept == x86_intercept_lmsw) { 4087 cr0 &= 0xfUL; 4088 val &= 0xfUL; 4089 /* lmsw can't clear PE - catch this here */ 4090 if (cr0 & X86_CR0_PE) 4091 val |= X86_CR0_PE; 4092 } 4093 4094 if (cr0 ^ val) 4095 icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; 4096 4097 break; 4098 } 4099 case SVM_EXIT_READ_DR0: 4100 case SVM_EXIT_WRITE_DR0: 4101 icpt_info.exit_code += info->modrm_reg; 4102 break; 4103 case SVM_EXIT_MSR: 4104 if (info->intercept == x86_intercept_wrmsr) 4105 vmcb->control.exit_info_1 = 1; 4106 else 4107 vmcb->control.exit_info_1 = 0; 4108 break; 4109 case SVM_EXIT_PAUSE: 4110 /* 4111 * We get this for NOP only, but pause 4112 * is rep not, check this here 4113 */ 4114 if (info->rep_prefix != REPE_PREFIX) 4115 goto out; 4116 break; 4117 case SVM_EXIT_IOIO: { 4118 u64 exit_info; 4119 u32 bytes; 4120 4121 if (info->intercept == x86_intercept_in || 4122 info->intercept == x86_intercept_ins) { 4123 exit_info = ((info->src_val & 0xffff) << 16) | 4124 SVM_IOIO_TYPE_MASK; 4125 bytes = info->dst_bytes; 4126 } else { 4127 exit_info = (info->dst_val & 0xffff) << 16; 4128 bytes = info->src_bytes; 4129 } 4130 4131 if (info->intercept == x86_intercept_outs || 4132 info->intercept == x86_intercept_ins) 4133 exit_info |= SVM_IOIO_STR_MASK; 4134 4135 if (info->rep_prefix) 4136 exit_info |= SVM_IOIO_REP_MASK; 4137 4138 bytes = min(bytes, 4u); 4139 4140 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT; 4141 4142 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1); 4143 4144 vmcb->control.exit_info_1 = exit_info; 4145 vmcb->control.exit_info_2 = info->next_rip; 4146 4147 break; 4148 } 4149 default: 4150 break; 4151 } 4152 4153 /* TODO: Advertise NRIPS to guest hypervisor unconditionally */ 4154 if (static_cpu_has(X86_FEATURE_NRIPS)) 4155 vmcb->control.next_rip = info->next_rip; 4156 vmcb->control.exit_code = icpt_info.exit_code; 4157 vmexit = nested_svm_exit_handled(svm); 4158 4159 ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED 4160 : X86EMUL_CONTINUE; 4161 4162 out: 4163 return ret; 4164 } 4165 4166 static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) 4167 { 4168 } 4169 4170 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) 4171 { 4172 if (!kvm_pause_in_guest(vcpu->kvm)) 4173 shrink_ple_window(vcpu); 4174 } 4175 4176 static void svm_setup_mce(struct kvm_vcpu *vcpu) 4177 { 4178 /* [63:9] are reserved. 
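 * Masking with 0x1ff keeps only MCG_CAP[8:0]: the bank count in bits
 * [7:0] and MCG_CTL_P in bit 8.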
 */
4179 vcpu->arch.mcg_cap &= 0x1ff;
4180 }
4181
4182 bool svm_smi_blocked(struct kvm_vcpu *vcpu)
4183 {
4184 struct vcpu_svm *svm = to_svm(vcpu);
4185
4186 /* Per APM Vol.2 15.22.2 "Response to SMI" */
4187 if (!gif_set(svm))
4188 return true;
4189
4190 return is_smm(vcpu);
4191 }
4192
4193 static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
4194 {
4195 struct vcpu_svm *svm = to_svm(vcpu);
4196 if (svm->nested.nested_run_pending)
4197 return -EBUSY;
4198
4199 if (svm_smi_blocked(vcpu))
4200 return 0;
4201
4202 /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
4203 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
4204 return -EBUSY;
4205
4206 return 1;
4207 }
4208
4209 static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
4210 {
4211 struct vcpu_svm *svm = to_svm(vcpu);
4212 struct kvm_host_map map_save;
4213 int ret;
4214
4215 if (!is_guest_mode(vcpu))
4216 return 0;
4217
4218 /* FED8h - SVM Guest */
4219 put_smstate(u64, smstate, 0x7ed8, 1);
4220 /* FEE0h - SVM Guest VMCB Physical Address */
4221 put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);
4222
4223 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
4224 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
4225 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
4226
4227 ret = nested_svm_vmexit(svm);
4228 if (ret)
4229 return ret;
4230
4231 /*
4232 * KVM uses VMCB01 to store L1 host state while L2 runs, but
4233 * VMCB01 is going to be used during SMM and thus the state will
4234 * be lost. Temporarily save the non-VMLOAD/VMSAVE state to the host
4235 * save area pointed to by MSR_VM_HSAVE_PA. The APM guarantees that the
4236 * format of the area is identical to the guest save area, offset
4237 * by 0x400 (matches the offset of 'struct vmcb_save_area'
4238 * within 'struct vmcb'). Note: the HSAVE area may also be used by
4239 * the L1 hypervisor to save additional host context (e.g. KVM does
4240 * that, see svm_prepare_switch_to_guest()), which must be
4241 * preserved.
4242 */
4243 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
4244 &map_save) == -EINVAL)
4245 return 1;
4246
4247 BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);
4248
4249 svm_copy_vmrun_state(map_save.hva + 0x400,
4250 &svm->vmcb01.ptr->save);
4251
4252 kvm_vcpu_unmap(vcpu, &map_save, true);
4253 return 0;
4254 }
4255
4256 static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
4257 {
4258 struct vcpu_svm *svm = to_svm(vcpu);
4259 struct kvm_host_map map, map_save;
4260 u64 saved_efer, vmcb12_gpa;
4261 struct vmcb *vmcb12;
4262 int ret;
4263
4264 if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
4265 return 0;
4266
4267 /* Non-zero if SMI arrived while vCPU was in guest mode. */
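/*
 * svm_enter_smm() stored 1 at SMRAM offset 0x7ed8 (FED8h, "SVM Guest")
 * before emulating the SMI vmexit, see above.
 */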
4268 if (!GET_SMSTATE(u64, smstate, 0x7ed8))
4269 return 0;
4270
4271 if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
4272 return 1;
4273
4274 saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
4275 if (!(saved_efer & EFER_SVME))
4276 return 1;
4277
4278 vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
4279 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
4280 return 1;
4281
4282 ret = 1;
4283 if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
4284 goto unmap_map;
4285
4286 if (svm_allocate_nested(svm))
4287 goto unmap_save;
4288
4289 /*
4290 * Restore L1 host state from the L1 HSAVE area, as VMCB01 was
4291 * used during SMM (see svm_enter_smm()).
4292 */
4293
4294 svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4295
4296 /*
4297 * Enter the nested guest now.
4298 */
4299
4300 vmcb_mark_all_dirty(svm->vmcb01.ptr);
4301
4302 vmcb12 = map.hva;
4303 nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
4304 nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
4305 ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);
4306
4307 if (ret)
4308 goto unmap_save;
4309
4310 svm->nested.nested_run_pending = 1;
4311
4312 unmap_save:
4313 kvm_vcpu_unmap(vcpu, &map_save, true);
4314 unmap_map:
4315 kvm_vcpu_unmap(vcpu, &map, true);
4316 return ret;
4317 }
4318
4319 static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
4320 {
4321 struct vcpu_svm *svm = to_svm(vcpu);
4322
4323 if (!gif_set(svm)) {
4324 if (vgif_enabled(svm))
4325 svm_set_intercept(svm, INTERCEPT_STGI);
4326 /* STGI will cause a vm exit */
4327 } else {
4328 /* We must be in SMM; RSM will cause a vmexit anyway. */
4329 }
4330 }
4331
4332 static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
4333 void *insn, int insn_len)
4334 {
4335 bool smep, smap, is_user;
4336 unsigned long cr4;
4337 u64 error_code;
4338
4339 /* Emulation is always possible when KVM has access to all guest state. */
4340 if (!sev_guest(vcpu->kvm))
4341 return true;
4342
4343 /* #UD and #GP should never be intercepted for SEV guests. */
4344 WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
4345 EMULTYPE_TRAP_UD_FORCED |
4346 EMULTYPE_VMWARE_GP));
4347
4348 /*
4349 * Emulation is impossible for SEV-ES guests as KVM doesn't have access
4350 * to guest register state.
4351 */
4352 if (sev_es_guest(vcpu->kvm))
4353 return false;
4354
4355 /*
4356 * Emulation is possible if the instruction is already decoded, e.g.
4357 * when completing I/O after returning from userspace.
4358 */
4359 if (emul_type & EMULTYPE_NO_DECODE)
4360 return true;
4361
4362 /*
4363 * Emulation is possible for SEV guests if and only if a prefilled
4364 * buffer containing the bytes of the intercepted instruction is
4365 * available. SEV guest memory is encrypted with a guest-specific key
4366 * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and
4367 * decode garbage.
4368 *
4369 * Inject #UD if KVM reached this point without an instruction buffer.
4370 * In practice, this path should never be hit by a well-behaved guest,
4371 * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
4372 * is still theoretically reachable, e.g. via unaccelerated fault-like
4373 * AVIC access, and needs to be handled by KVM to avoid putting the
4374 * guest into an infinite loop. Injecting #UD is somewhat arbitrary,
4375 * but it's the least awful option given lack of insight into the guest.
4376 */
4377 if (unlikely(!insn)) {
4378 kvm_queue_exception(vcpu, UD_VECTOR);
4379 return false;
4380 }
4381
4382 /*
4383 * Emulate for SEV guests if the insn buffer is not empty. The buffer
4384 * will be empty if the DecodeAssist microcode cannot fetch bytes for
4385 * the faulting instruction because the code fetch itself faulted, e.g.
4386 * the guest attempted to fetch from emulated MMIO or a guest page
4387 * table used to translate CS:RIP resides in emulated MMIO.
4388 */
4389 if (likely(insn_len))
4390 return true;
4391
4392 /*
4393 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
4394 *
4395 * Errata:
4396 * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
4397 * possible that CPU microcode implementing DecodeAssist will fail to
4398 * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
4399 * be '0'. This happens because microcode reads CS:RIP using a _data_
4400 * load uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode
4401 * gives up and does not fill the instruction bytes buffer.
4402 *
4403 * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
4404 * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
4405 * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
4406 * GuestIntrBytes field of the VMCB.
4407 *
4408 * This does _not_ mean that the erratum has been encountered, as the
4409 * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
4410 * #PF, e.g. if the guest attempted to execute from emulated MMIO and
4411 * encountered a reserved/not-present #PF.
4412 *
4413 * To hit the erratum, the following conditions must be true:
4414 * 1. CR4.SMAP=1 (obviously).
4415 * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
4416 * have been hit as the guest would have encountered a SMEP
4417 * violation #PF, not a #NPF.
4418 * 3. The #NPF is not due to a code fetch, in which case failure to
4419 * retrieve the instruction bytes is legitimate (see above).
4420 *
4421 * In addition, don't apply the erratum workaround if the #NPF occurred
4422 * while translating guest page tables (see below).
4423 */
4424 error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
4425 if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
4426 goto resume_guest;
4427
4428 cr4 = kvm_read_cr4(vcpu);
4429 smep = cr4 & X86_CR4_SMEP;
4430 smap = cr4 & X86_CR4_SMAP;
4431 is_user = svm_get_cpl(vcpu) == 3;
4432 if (smap && (!smep || is_user)) {
4433 pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
4434
4435 /*
4436 * If the fault occurred in userspace, arbitrarily inject #GP
4437 * to avoid killing the guest and to hopefully avoid confusing
4438 * the guest kernel too much, e.g. injecting #PF would not be
4439 * coherent with respect to the guest's page tables. Request
4440 * triple fault if the fault occurred in the kernel as there's
4441 * no fault that KVM can inject without confusing the guest.
4442 * In practice, the triple fault is moot as no sane SEV kernel
4443 * will execute from user memory while also running with SMAP=1.
4444 */
4445 if (is_user)
4446 kvm_inject_gp(vcpu, 0);
4447 else
4448 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4449 }
4450
4451 resume_guest:
4452 /*
4453 * If the erratum was not hit, simply resume the guest and let it fault
4454 * again. While awful, e.g. the vCPU may get stuck in an infinite loop
4455 * if the fault is at CPL=0, it's the lesser of all evils.
Exiting to 4456 * userspace will kill the guest, and letting the emulator read garbage 4457 * will yield random behavior and potentially corrupt the guest. 4458 * 4459 * Simply resuming the guest is technically not a violation of the SEV 4460 * architecture. AMD's APM states that all code fetches and page table 4461 * accesses for SEV guest are encrypted, regardless of the C-Bit. The 4462 * APM also states that encrypted accesses to MMIO are "ignored", but 4463 * doesn't explicitly define "ignored", i.e. doing nothing and letting 4464 * the guest spin is technically "ignoring" the access. 4465 */ 4466 return false; 4467 } 4468 4469 static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) 4470 { 4471 struct vcpu_svm *svm = to_svm(vcpu); 4472 4473 /* 4474 * TODO: Last condition latch INIT signals on vCPU when 4475 * vCPU is in guest-mode and vmcb12 defines intercept on INIT. 4476 * To properly emulate the INIT intercept, 4477 * svm_check_nested_events() should call nested_svm_vmexit() 4478 * if an INIT signal is pending. 4479 */ 4480 return !gif_set(svm) || 4481 (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT)); 4482 } 4483 4484 static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) 4485 { 4486 if (!sev_es_guest(vcpu->kvm)) 4487 return kvm_vcpu_deliver_sipi_vector(vcpu, vector); 4488 4489 sev_vcpu_deliver_sipi_vector(vcpu, vector); 4490 } 4491 4492 static void svm_vm_destroy(struct kvm *kvm) 4493 { 4494 avic_vm_destroy(kvm); 4495 sev_vm_destroy(kvm); 4496 } 4497 4498 static int svm_vm_init(struct kvm *kvm) 4499 { 4500 if (!pause_filter_count || !pause_filter_thresh) 4501 kvm->arch.pause_in_guest = true; 4502 4503 if (enable_apicv) { 4504 int ret = avic_vm_init(kvm); 4505 if (ret) 4506 return ret; 4507 } 4508 4509 return 0; 4510 } 4511 4512 static struct kvm_x86_ops svm_x86_ops __initdata = { 4513 .name = "kvm_amd", 4514 4515 .hardware_unsetup = svm_hardware_unsetup, 4516 .hardware_enable = svm_hardware_enable, 4517 .hardware_disable = svm_hardware_disable, 4518 .has_emulated_msr = svm_has_emulated_msr, 4519 4520 .vcpu_create = svm_vcpu_create, 4521 .vcpu_free = svm_vcpu_free, 4522 .vcpu_reset = svm_vcpu_reset, 4523 4524 .vm_size = sizeof(struct kvm_svm), 4525 .vm_init = svm_vm_init, 4526 .vm_destroy = svm_vm_destroy, 4527 4528 .prepare_switch_to_guest = svm_prepare_switch_to_guest, 4529 .vcpu_load = svm_vcpu_load, 4530 .vcpu_put = svm_vcpu_put, 4531 .vcpu_blocking = avic_vcpu_blocking, 4532 .vcpu_unblocking = avic_vcpu_unblocking, 4533 4534 .update_exception_bitmap = svm_update_exception_bitmap, 4535 .get_msr_feature = svm_get_msr_feature, 4536 .get_msr = svm_get_msr, 4537 .set_msr = svm_set_msr, 4538 .get_segment_base = svm_get_segment_base, 4539 .get_segment = svm_get_segment, 4540 .set_segment = svm_set_segment, 4541 .get_cpl = svm_get_cpl, 4542 .get_cs_db_l_bits = svm_get_cs_db_l_bits, 4543 .set_cr0 = svm_set_cr0, 4544 .post_set_cr3 = sev_post_set_cr3, 4545 .is_valid_cr4 = svm_is_valid_cr4, 4546 .set_cr4 = svm_set_cr4, 4547 .set_efer = svm_set_efer, 4548 .get_idt = svm_get_idt, 4549 .set_idt = svm_set_idt, 4550 .get_gdt = svm_get_gdt, 4551 .set_gdt = svm_set_gdt, 4552 .set_dr7 = svm_set_dr7, 4553 .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, 4554 .cache_reg = svm_cache_reg, 4555 .get_rflags = svm_get_rflags, 4556 .set_rflags = svm_set_rflags, 4557 .get_if_flag = svm_get_if_flag, 4558 4559 .flush_tlb_all = svm_flush_tlb_current, 4560 .flush_tlb_current = svm_flush_tlb_current, 4561 .flush_tlb_gva = svm_flush_tlb_gva, 4562 .flush_tlb_guest = 
svm_flush_tlb_current, 4563 4564 .vcpu_pre_run = svm_vcpu_pre_run, 4565 .vcpu_run = svm_vcpu_run, 4566 .handle_exit = svm_handle_exit, 4567 .skip_emulated_instruction = svm_skip_emulated_instruction, 4568 .update_emulated_instruction = NULL, 4569 .set_interrupt_shadow = svm_set_interrupt_shadow, 4570 .get_interrupt_shadow = svm_get_interrupt_shadow, 4571 .patch_hypercall = svm_patch_hypercall, 4572 .inject_irq = svm_inject_irq, 4573 .inject_nmi = svm_inject_nmi, 4574 .queue_exception = svm_queue_exception, 4575 .cancel_injection = svm_cancel_injection, 4576 .interrupt_allowed = svm_interrupt_allowed, 4577 .nmi_allowed = svm_nmi_allowed, 4578 .get_nmi_mask = svm_get_nmi_mask, 4579 .set_nmi_mask = svm_set_nmi_mask, 4580 .enable_nmi_window = svm_enable_nmi_window, 4581 .enable_irq_window = svm_enable_irq_window, 4582 .update_cr8_intercept = svm_update_cr8_intercept, 4583 .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, 4584 .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons, 4585 .apicv_post_state_restore = avic_apicv_post_state_restore, 4586 4587 .get_mt_mask = svm_get_mt_mask, 4588 .get_exit_info = svm_get_exit_info, 4589 4590 .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, 4591 4592 .has_wbinvd_exit = svm_has_wbinvd_exit, 4593 4594 .get_l2_tsc_offset = svm_get_l2_tsc_offset, 4595 .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, 4596 .write_tsc_offset = svm_write_tsc_offset, 4597 .write_tsc_multiplier = svm_write_tsc_multiplier, 4598 4599 .load_mmu_pgd = svm_load_mmu_pgd, 4600 4601 .check_intercept = svm_check_intercept, 4602 .handle_exit_irqoff = svm_handle_exit_irqoff, 4603 4604 .request_immediate_exit = __kvm_request_immediate_exit, 4605 4606 .sched_in = svm_sched_in, 4607 4608 .pmu_ops = &amd_pmu_ops, 4609 .nested_ops = &svm_nested_ops, 4610 4611 .deliver_interrupt = svm_deliver_interrupt, 4612 .pi_update_irte = avic_pi_update_irte, 4613 .setup_mce = svm_setup_mce, 4614 4615 .smi_allowed = svm_smi_allowed, 4616 .enter_smm = svm_enter_smm, 4617 .leave_smm = svm_leave_smm, 4618 .enable_smi_window = svm_enable_smi_window, 4619 4620 .mem_enc_ioctl = sev_mem_enc_ioctl, 4621 .mem_enc_register_region = sev_mem_enc_register_region, 4622 .mem_enc_unregister_region = sev_mem_enc_unregister_region, 4623 .guest_memory_reclaimed = sev_guest_memory_reclaimed, 4624 4625 .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, 4626 .vm_move_enc_context_from = sev_vm_move_enc_context_from, 4627 4628 .can_emulate_instruction = svm_can_emulate_instruction, 4629 4630 .apic_init_signal_blocked = svm_apic_init_signal_blocked, 4631 4632 .msr_filter_changed = svm_msr_filter_changed, 4633 .complete_emulated_msr = svm_complete_emulated_msr, 4634 4635 .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, 4636 }; 4637 4638 /* 4639 * The default MMIO mask is a single bit (excluding the present bit), 4640 * which could conflict with the memory encryption bit. Check for 4641 * memory encryption support and override the default MMIO mask if 4642 * memory encryption is enabled. 
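 *
 * For example (hypothetical values): with the C-bit at position 47 and
 * x86_phys_bits reduced to 43 by memory encryption, enc_bit != mask_bit,
 * so the MMIO mask below becomes rsvd_bits(43, 51) | PT_PRESENT_MASK.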
4643 */ 4644 static __init void svm_adjust_mmio_mask(void) 4645 { 4646 unsigned int enc_bit, mask_bit; 4647 u64 msr, mask; 4648 4649 /* If there is no memory encryption support, use existing mask */ 4650 if (cpuid_eax(0x80000000) < 0x8000001f) 4651 return; 4652 4653 /* If memory encryption is not enabled, use existing mask */ 4654 rdmsrl(MSR_AMD64_SYSCFG, msr); 4655 if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) 4656 return; 4657 4658 enc_bit = cpuid_ebx(0x8000001f) & 0x3f; 4659 mask_bit = boot_cpu_data.x86_phys_bits; 4660 4661 /* Increment the mask bit if it is the same as the encryption bit */ 4662 if (enc_bit == mask_bit) 4663 mask_bit++; 4664 4665 /* 4666 * If the mask bit location is below 52, then some bits above the 4667 * physical addressing limit will always be reserved, so use the 4668 * rsvd_bits() function to generate the mask. This mask, along with 4669 * the present bit, will be used to generate a page fault with 4670 * PFER.RSV = 1. 4671 * 4672 * If the mask bit location is 52 (or above), then clear the mask. 4673 */ 4674 mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; 4675 4676 kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); 4677 } 4678 4679 static __init void svm_set_cpu_caps(void) 4680 { 4681 kvm_set_cpu_caps(); 4682 4683 supported_xss = 0; 4684 4685 /* CPUID 0x80000001 and 0x8000000A (SVM features) */ 4686 if (nested) { 4687 kvm_cpu_cap_set(X86_FEATURE_SVM); 4688 kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN); 4689 4690 if (nrips) 4691 kvm_cpu_cap_set(X86_FEATURE_NRIPS); 4692 4693 if (npt_enabled) 4694 kvm_cpu_cap_set(X86_FEATURE_NPT); 4695 4696 if (tsc_scaling) 4697 kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); 4698 4699 /* Nested VM can receive #VMEXIT instead of triggering #GP */ 4700 kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); 4701 } 4702 4703 /* CPUID 0x80000008 */ 4704 if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || 4705 boot_cpu_has(X86_FEATURE_AMD_SSBD)) 4706 kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); 4707 4708 /* AMD PMU PERFCTR_CORE CPUID */ 4709 if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) 4710 kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); 4711 4712 /* CPUID 0x8000001F (SME/SEV features) */ 4713 sev_set_cpu_caps(); 4714 } 4715 4716 static __init int svm_hardware_setup(void) 4717 { 4718 int cpu; 4719 struct page *iopm_pages; 4720 void *iopm_va; 4721 int r; 4722 unsigned int order = get_order(IOPM_SIZE); 4723 4724 /* 4725 * NX is required for shadow paging and for NPT if the NX huge pages 4726 * mitigation is enabled. 
4727 */ 4728 if (!boot_cpu_has(X86_FEATURE_NX)) { 4729 pr_err_ratelimited("NX (Execute Disable) not supported\n"); 4730 return -EOPNOTSUPP; 4731 } 4732 kvm_enable_efer_bits(EFER_NX); 4733 4734 iopm_pages = alloc_pages(GFP_KERNEL, order); 4735 4736 if (!iopm_pages) 4737 return -ENOMEM; 4738 4739 iopm_va = page_address(iopm_pages); 4740 memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); 4741 iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; 4742 4743 init_msrpm_offsets(); 4744 4745 supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); 4746 4747 if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) 4748 kvm_enable_efer_bits(EFER_FFXSR); 4749 4750 if (tsc_scaling) { 4751 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { 4752 tsc_scaling = false; 4753 } else { 4754 pr_info("TSC scaling supported\n"); 4755 kvm_has_tsc_control = true; 4756 } 4757 } 4758 kvm_max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; 4759 kvm_tsc_scaling_ratio_frac_bits = 32; 4760 4761 tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); 4762 4763 /* Check for pause filtering support */ 4764 if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { 4765 pause_filter_count = 0; 4766 pause_filter_thresh = 0; 4767 } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { 4768 pause_filter_thresh = 0; 4769 } 4770 4771 if (nested) { 4772 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 4773 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); 4774 } 4775 4776 /* 4777 * KVM's MMU doesn't support using 2-level paging for itself, and thus 4778 * NPT isn't supported if the host is using 2-level paging since host 4779 * CR4 is unchanged on VMRUN. 4780 */ 4781 if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) 4782 npt_enabled = false; 4783 4784 if (!boot_cpu_has(X86_FEATURE_NPT)) 4785 npt_enabled = false; 4786 4787 /* Force VM NPT level equal to the host's paging level */ 4788 kvm_configure_mmu(npt_enabled, get_npt_level(), 4789 get_npt_level(), PG_LEVEL_1G); 4790 pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); 4791 4792 /* Note, SEV setup consumes npt_enabled. */ 4793 sev_hardware_setup(); 4794 4795 svm_hv_hardware_setup(); 4796 4797 svm_adjust_mmio_mask(); 4798 4799 for_each_possible_cpu(cpu) { 4800 r = svm_cpu_init(cpu); 4801 if (r) 4802 goto err; 4803 } 4804 4805 if (nrips) { 4806 if (!boot_cpu_has(X86_FEATURE_NRIPS)) 4807 nrips = false; 4808 } 4809 4810 enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC); 4811 4812 if (enable_apicv) { 4813 pr_info("AVIC enabled\n"); 4814 4815 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); 4816 } else { 4817 svm_x86_ops.vcpu_blocking = NULL; 4818 svm_x86_ops.vcpu_unblocking = NULL; 4819 } 4820 4821 if (vls) { 4822 if (!npt_enabled || 4823 !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || 4824 !IS_ENABLED(CONFIG_X86_64)) { 4825 vls = false; 4826 } else { 4827 pr_info("Virtual VMLOAD VMSAVE supported\n"); 4828 } 4829 } 4830 4831 if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) 4832 svm_gp_erratum_intercept = false; 4833 4834 if (vgif) { 4835 if (!boot_cpu_has(X86_FEATURE_VGIF)) 4836 vgif = false; 4837 else 4838 pr_info("Virtual GIF supported\n"); 4839 } 4840 4841 if (lbrv) { 4842 if (!boot_cpu_has(X86_FEATURE_LBRV)) 4843 lbrv = false; 4844 else 4845 pr_info("LBR virtualization supported\n"); 4846 } 4847 4848 if (!enable_pmu) 4849 pr_info("PMU virtualization is disabled\n"); 4850 4851 svm_set_cpu_caps(); 4852 4853 /* 4854 * It seems that on AMD processors PTE's accessed bit is 4855 * being set by the CPU hardware before the NPF vmexit. 
4856 * This is not expected behaviour and our tests fail because
4857 * of it.
4858 * A workaround here is to disable support for
4859 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
4860 * In this case userspace can know if there is support using the
4861 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
4862 * it.
4863 * If future AMD CPU models change the behaviour described above,
4864 * this variable can be changed accordingly.
4865 */
4866 allow_smaller_maxphyaddr = !npt_enabled;
4867
4868 return 0;
4869
4870 err:
4871 svm_hardware_unsetup();
4872 return r;
4873 }
4874
4875
4876 static struct kvm_x86_init_ops svm_init_ops __initdata = {
4877 .cpu_has_kvm_support = has_svm,
4878 .disabled_by_bios = is_disabled,
4879 .hardware_setup = svm_hardware_setup,
4880 .check_processor_compatibility = svm_check_processor_compat,
4881
4882 .runtime_ops = &svm_x86_ops,
4883 };
4884
4885 static int __init svm_init(void)
4886 {
4887 __unused_size_checks();
4888
4889 return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
4890 __alignof__(struct vcpu_svm), THIS_MODULE);
4891 }
4892
4893 static void __exit svm_exit(void)
4894 {
4895 kvm_exit();
4896 }
4897
4898 module_init(svm_init)
4899 module_exit(svm_exit)
4900