#define pr_fmt(fmt) "SVM: " fmt

#include <linux/kvm_host.h>

#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"

#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/amd-iommu.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
#include <linux/objtool.h>
#include <linux/psp-sev.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/rwsem.h>
#include <linux/cc_platform.h>

#include <asm/apic.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/fpu/api.h>

#include <asm/virtext.h>
#include "trace.h"

#include "svm.h"
#include "svm_ops.h"

#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id svm_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif

#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

#define SVM_FEATURE_LBRV           (1 <<  1)
#define SVM_FEATURE_SVML           (1 <<  2)
#define SVM_FEATURE_TSC_RATE       (1 <<  4)
#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)

#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))

#define TSC_RATIO_RSVD		0xffffff0000000000ULL
#define TSC_RATIO_MIN		0x0000000000000001ULL
#define TSC_RATIO_MAX		0x000000ffffffffffULL

static bool erratum_383_found __read_mostly;

u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;

/*
 * Set osvw_len to higher value when updated Revision Guides
 * are published and we know what the new status bits are
 */
static uint64_t osvw_len = 4, osvw_status;

static DEFINE_PER_CPU(u64, current_tsc_ratio);
#define TSC_RATIO_DEFAULT	0x0100000000ULL
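
/*
 * Illustrative note, not from the original source: MSR_AMD64_TSC_RATIO
 * holds an 8.32 fixed-point value (bits 39:32 integer, bits 31:0
 * fraction), which is why TSC_RATIO_DEFAULT (0x0100000000) encodes a
 * ratio of exactly 1.0 and TSC_RATIO_RSVD masks everything above bit 39.
 * For example, 0x0180000000 would run the guest TSC at 1.5x the host rate.
 */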

static const struct svm_direct_access_msrs {
	u32 index;   /* Index of the MSR */
	bool always; /* True if intercept is initially cleared */
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
	{ .index = MSR_STAR,				.always = true  },
	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
	{ .index = MSR_IA32_SYSENTER_EIP,		.always = false },
	{ .index = MSR_IA32_SYSENTER_ESP,		.always = false },
#ifdef CONFIG_X86_64
	{ .index = MSR_GS_BASE,				.always = true  },
	{ .index = MSR_FS_BASE,				.always = true  },
	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
	{ .index = MSR_LSTAR,				.always = true  },
	{ .index = MSR_CSTAR,				.always = true  },
	{ .index = MSR_SYSCALL_MASK,			.always = true  },
#endif
	{ .index = MSR_IA32_SPEC_CTRL,			.always = false },
	{ .index = MSR_IA32_PRED_CMD,			.always = false },
	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
	{ .index = MSR_EFER,				.always = false },
	{ .index = MSR_IA32_CR_PAT,			.always = false },
	{ .index = MSR_AMD64_SEV_ES_GHCB,		.always = true  },
	{ .index = MSR_INVALID,				.always = false },
};

/*
 * These 2 parameters are used to configure the controls for Pause-Loop
 * Exiting:
 * pause_filter_count: On processors that support Pause filtering (indicated
 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 *	count value. On VMRUN this value is loaded into an internal counter.
 *	Each time a pause instruction is executed, this counter is decremented
 *	until it reaches zero, at which time a #VMEXIT is generated if pause
 *	intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
 *	Intercept Filtering for more details.
 *	This also indicates whether PLE logic is enabled.
 *
 * pause_filter_thresh: In addition, some processor families support advanced
 *	pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an
 *	upper bound on the amount of time a guest is allowed to execute in a
 *	pause loop.
 *	In this mode, a 16-bit pause filter threshold field is added in the
 *	VMCB. The threshold value is a cycle count that is used to reset the
 *	pause counter. As with simple pause filtering, VMRUN loads the pause
 *	count value from VMCB into an internal counter. Then, on each pause
 *	instruction the hardware checks the elapsed number of cycles since
 *	the most recent pause instruction against the pause filter threshold.
 *	If the elapsed cycle count is greater than the pause filter threshold,
 *	then the internal pause count is reloaded from the VMCB and execution
 *	continues. If the elapsed cycle count is less than the pause filter
 *	threshold, then the internal pause count is decremented. If the count
 *	value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 *	triggered. If advanced pause filtering is supported and pause filter
 *	threshold field is set to zero, the filter will operate in the simpler,
 *	count only mode.
 */

static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);
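
/*
 * Worked example, not from the original source: with the defaults above
 * (window 3000, grow 2, shrink 0), a PAUSE-triggered exit doubles the
 * vCPU's window (3000 -> 6000 -> 12000, clamped at
 * pause_filter_count_max), while the default shrink value of 0 resets it
 * straight back to pause_filter_count; see __grow_ple_window() and
 * __shrink_ple_window().
 */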

/*
 * Use nested page tables by default. Note, NPT may get forced off by
 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 */
bool npt_enabled = true;
module_param_named(npt, npt_enabled, bool, 0444);

/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, S_IRUGO);

/* enable/disable Next RIP Save */
static int nrips = true;
module_param(nrips, int, 0444);

/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);

/* enable/disable Virtual GIF */
static int vgif = true;
module_param(vgif, int, 0444);

/* enable/disable LBR virtualization */
static int lbrv = true;
module_param(lbrv, int, 0444);

/* enable/disable PMU virtualization */
bool pmu = true;
module_param(pmu, bool, 0444);

static int tsc_scaling = true;
module_param(tsc_scaling, int, 0444);

/*
 * enable / disable AVIC. Because the defaults differ for APICv
 * support between VMX and SVM we cannot use module_param_named.
 */
static bool avic;
module_param(avic, bool, 0444);

bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);


bool intercept_smi = true;
module_param(intercept_smi, bool, 0444);


static bool svm_gp_erratum_intercept = true;

static u8 rsm_ins_bytes[] = "\x0f\xaa";

static unsigned long iopm_base;

struct kvm_ldttss_desc {
	u16 limit0;
	u16 base0;
	unsigned base1:8, type:5, dpl:2, p:1;
	unsigned limit1:4, zero0:3, g:1, base2:8;
	u32 base3;
	u32 zero1;
} __attribute__((packed));

DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);

/*
 * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 *
 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 * defer the restoration of TSC_AUX until the CPU returns to userspace.
 */
static int tsc_aux_uret_slot __read_mostly = -1;

static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};

#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)

u32 svm_msrpm_offset(u32 msr)
{
	u32 offset;
	int i;

	for (i = 0; i < NUM_MSR_MAPS; i++) {
		if (msr < msrpm_ranges[i] ||
		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
			continue;

		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8   */
		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */

		/* Now we have the u8 offset - but need the u32 offset */
		return offset / 4;
	}

	/* MSR not in any range */
	return MSR_INVALID;
}
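
/*
 * Worked example, not part of the original source: for MSR_EFER
 * (0xc0000080) the matching range base is 0xc0000000, so the byte
 * offset is 0x80 / 4 = 0x20, plus the range offset 1 * 2048 = 0x800,
 * giving a u8 offset of 0x820 and a final u32 offset of 0x208.
 */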

#define MAX_INST_SIZE 15

static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
	return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
#else
	return PT32E_ROOT_LEVEL;
#endif
}

int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 old_efer = vcpu->arch.efer;
	vcpu->arch.efer = efer;

	if (!npt_enabled) {
		/* Shadow paging assumes NX to be available.  */
		efer |= EFER_NX;

		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}

	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
		if (!(efer & EFER_SVME)) {
			svm_leave_nested(svm);
			svm_set_gif(svm, true);
			/* #GP intercept is still needed for vmware backdoor */
			if (!enable_vmware_backdoor)
				clr_exception_intercept(svm, GP_VECTOR);

			/*
			 * Free the nested guest state, unless we are in SMM.
			 * In this case we will return to the nested guest
			 * as soon as we leave SMM.
			 */
			if (!is_smm(vcpu))
				svm_free_nested(svm);

		} else {
			int ret = svm_allocate_nested(svm);

			if (ret) {
				vcpu->arch.efer = old_efer;
				return ret;
			}

			if (svm_gp_erratum_intercept)
				set_exception_intercept(svm, GP_VECTOR);
		}
	}

	svm->vmcb->save.efer = efer | EFER_SVME;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
	return 0;
}

static int is_external_interrupt(u32 info)
{
	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
}

static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ret = 0;

	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
	return ret;
}

static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (mask == 0)
		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
	else
		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;

}

static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * SEV-ES does not expose the next RIP. The RIP update is controlled by
	 * the type of exit and the #VC handler in the guest.
	 */
	if (sev_es_guest(vcpu->kvm))
		goto done;

	if (nrips && svm->vmcb->control.next_rip != 0) {
		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
		svm->next_rip = svm->vmcb->control.next_rip;
	}

	if (!svm->next_rip) {
		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;
	} else {
		kvm_rip_write(vcpu, svm->next_rip);
	}

done:
	svm_set_interrupt_shadow(vcpu, 0);

	return 1;
}

static void svm_queue_exception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned nr = vcpu->arch.exception.nr;
	bool has_error_code = vcpu->arch.exception.has_error_code;
	u32 error_code = vcpu->arch.exception.error_code;

	kvm_deliver_exception_payload(vcpu);

	if (nr == BP_VECTOR && !nrips) {
		unsigned long rip, old_rip = kvm_rip_read(vcpu);

		/*
		 * For guest debugging where we have to reinject #BP if some
		 * INT3 is guest-owned:
		 * Emulate nRIP by moving RIP forward. Will fail if injection
		 * raises a fault that is not intercepted. Still better than
		 * failing in all cases.
		 */
		(void)skip_emulated_instruction(vcpu);
		rip = kvm_rip_read(vcpu);
		svm->int3_rip = rip + svm->vmcb->save.cs.base;
		svm->int3_injected = rip - old_rip;
	}

	svm->vmcb->control.event_inj = nr
		| SVM_EVTINJ_VALID
		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
		| SVM_EVTINJ_TYPE_EXEPT;
	svm->vmcb->control.event_inj_err = error_code;
}
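
/*
 * Illustrative encoding, not from the original source: per the APM
 * EVENTINJ layout (vector in bits 7:0, type in bits 10:8, EV in bit 11,
 * V in bit 31), injecting #GP(0) would set event_inj to
 * 13 | SVM_EVTINJ_TYPE_EXEPT | SVM_EVTINJ_VALID_ERR | SVM_EVTINJ_VALID,
 * with event_inj_err holding the error code, here 0.
 */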

static void svm_init_erratum_383(void)
{
	u32 low, high;
	int err;
	u64 val;

	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
		return;

	/* Use _safe variants to not break nested virtualization */
	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
	if (err)
		return;

	val |= (1ULL << 47);

	low  = lower_32_bits(val);
	high = upper_32_bits(val);

	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);

	erratum_383_found = true;
}

static void svm_init_osvw(struct kvm_vcpu *vcpu)
{
	/*
	 * Guests should see errata 400 and 415 as fixed (assuming that
	 * HLT and IO instructions are intercepted).
	 */
	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
	vcpu->arch.osvw.status = osvw_status & ~(6ULL);

	/*
	 * By increasing VCPU's osvw.length to 3 we are telling the guest that
	 * all osvw.status bits inside that length, including bit 0 (which is
	 * reserved for erratum 298), are valid. However, if host processor's
	 * osvw_len is 0 then osvw_status[0] carries no information. We need to
	 * be conservative here and therefore we tell the guest that erratum 298
	 * is present (because we really don't know).
	 */
	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
		vcpu->arch.osvw.status |= 1;
}

static int has_svm(void)
{
	const char *msg;

	if (!cpu_has_svm(&msg)) {
		printk(KERN_INFO "has_svm: %s\n", msg);
		return 0;
	}

	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
		pr_info("KVM is unsupported when running as an SEV guest\n");
		return 0;
	}

	return 1;
}

static void svm_hardware_disable(void)
{
	/* Make sure we clean up behind us */
	if (tsc_scaling)
		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);

	cpu_svm_disable();

	amd_pmu_disable_virt();
}

static int svm_hardware_enable(void)
{

	struct svm_cpu_data *sd;
	uint64_t efer;
	struct desc_struct *gdt;
	int me = raw_smp_processor_id();

	rdmsrl(MSR_EFER, efer);
	if (efer & EFER_SVME)
		return -EBUSY;

	if (!has_svm()) {
		pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
		return -EINVAL;
	}
	sd = per_cpu(svm_data, me);
	if (!sd) {
		pr_err("%s: svm_data is NULL on %d\n", __func__, me);
		return -EINVAL;
	}

	sd->asid_generation = 1;
	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
	sd->next_asid = sd->max_asid + 1;
	sd->min_asid = max_sev_asid + 1;

	gdt = get_current_gdt_rw();
	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);

	wrmsrl(MSR_EFER, efer | EFER_SVME);

	wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area));

	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		/*
		 * Set the default value, even if we don't use TSC scaling,
		 * to avoid leaving a stale value in the MSR.
		 */
		wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
		__this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
	}


	/*
	 * Get OSVW bits.
	 *
	 * Note that it is possible to have a system with mixed processor
	 * revisions and therefore different OSVW bits. If bits are not the same
	 * on different processors then choose the worst case (i.e. if erratum
	 * is present on one processor and not on another then assume that the
	 * erratum is present everywhere).
	 */
	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
		uint64_t len, status = 0;
		int err;

		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
		if (!err)
			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
						      &err);

		if (err)
			osvw_status = osvw_len = 0;
		else {
			if (len < osvw_len)
				osvw_len = len;
			osvw_status |= status;
			osvw_status &= (1ULL << osvw_len) - 1;
		}
	} else
		osvw_status = osvw_len = 0;

	svm_init_erratum_383();

	amd_pmu_enable_virt();

	return 0;
}

static void svm_cpu_uninit(int cpu)
{
	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);

	if (!sd)
		return;

	per_cpu(svm_data, cpu) = NULL;
	kfree(sd->sev_vmcbs);
	__free_page(sd->save_area);
	kfree(sd);
}

static int svm_cpu_init(int cpu)
{
	struct svm_cpu_data *sd;
	int ret = -ENOMEM;

	sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
	if (!sd)
		return ret;
	sd->cpu = cpu;
	sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!sd->save_area)
		goto free_cpu_data;

	ret = sev_cpu_init(sd);
	if (ret)
		goto free_save_area;

	per_cpu(svm_data, cpu) = sd;

	return 0;

free_save_area:
	__free_page(sd->save_area);
free_cpu_data:
	kfree(sd);
	return ret;

}

static int direct_access_msr_slot(u32 msr)
{
	u32 i;

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
		if (direct_access_msrs[i].index == msr)
			return i;

	return -ENOENT;
}

static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
				     int write)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int slot = direct_access_msr_slot(msr);

	if (slot == -ENOENT)
		return;

	/* Set the shadow bitmaps to the desired intercept states */
	if (read)
		set_bit(slot, svm->shadow_msr_intercept.read);
	else
		clear_bit(slot, svm->shadow_msr_intercept.read);

	if (write)
		set_bit(slot, svm->shadow_msr_intercept.write);
	else
		clear_bit(slot, svm->shadow_msr_intercept.write);
}

static bool valid_msr_intercept(u32 index)
{
	return direct_access_msr_slot(index) != -ENOENT;
}

static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
	u8 bit_write;
	unsigned long tmp;
	u32 offset;
	u32 *msrpm;

	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
				      to_svm(vcpu)->msrpm;

	offset    = svm_msrpm_offset(msr);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	return !!test_bit(bit_write, &tmp);
}
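
/*
 * Illustrative layout note, not from the original source: each MSR
 * occupies two adjacent bits in the permission map (read intercept in
 * the even bit, write intercept in the odd bit), so one u32 covers 16
 * MSRs and "msr & 0x0f" selects the slot within it. E.g. for MSR_EFER
 * (0xc0000080), bit_write = 2 * 0 + 1 = 1 within u32 offset 0x208.
 */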

static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
					u32 msr, int read, int write)
{
	u8 bit_read, bit_write;
	unsigned long tmp;
	u32 offset;

	/*
	 * If this warning triggers, extend the direct_access_msrs list at the
	 * beginning of the file.
	 */
	WARN_ON(!valid_msr_intercept(msr));

	/* Enforce non-allowed MSRs to trap */
	if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
		read = 0;

	if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
		write = 0;

	offset    = svm_msrpm_offset(msr);
	bit_read  = 2 * (msr & 0x0f);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp       = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);

	msrpm[offset] = tmp;

	svm_hv_vmcb_dirty_nested_enlightenments(vcpu);

}

void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
			  int read, int write)
{
	set_shadow_msr_intercept(vcpu, msr, read, write);
	set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
}

u32 *svm_vcpu_alloc_msrpm(void)
{
	unsigned int order = get_order(MSRPM_SIZE);
	struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
	u32 *msrpm;

	if (!pages)
		return NULL;

	msrpm = page_address(pages);
	memset(msrpm, 0xff, PAGE_SIZE * (1 << order));

	return msrpm;
}

void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
{
	int i;

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		if (!direct_access_msrs[i].always)
			continue;
		set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
	}
}


void svm_vcpu_free_msrpm(u32 *msrpm)
{
	__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}

static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 i;

	/*
	 * Set intercept permissions for all direct access MSRs again. They
	 * will automatically get filtered through the MSR filter, so we are
	 * back in sync after this.
	 */
	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		u32 msr = direct_access_msrs[i].index;
		u32 read = test_bit(i, svm->shadow_msr_intercept.read);
		u32 write = test_bit(i, svm->shadow_msr_intercept.write);

		set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
	}
}

static void add_msr_offset(u32 offset)
{
	int i;

	for (i = 0; i < MSRPM_OFFSETS; ++i) {

		/* Offset already in list? */
		if (msrpm_offsets[i] == offset)
			return;

		/* Slot used by another offset? */
		if (msrpm_offsets[i] != MSR_INVALID)
			continue;

		/* Add offset to list */
		msrpm_offsets[i] = offset;

		return;
	}

	/*
	 * If this BUG triggers, the msrpm_offsets table has overflowed. Just
	 * increase MSRPM_OFFSETS in this case.
	 */
	BUG();
}

static void init_msrpm_offsets(void)
{
	int i;

	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		u32 offset;

		offset = svm_msrpm_offset(direct_access_msrs[i].index);
		BUG_ON(offset == MSR_INVALID);

		add_msr_offset(offset);
	}
}

static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}

static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}

void disable_nmi_singlestep(struct vcpu_svm *svm)
{
	svm->nmi_singlestep = false;

	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
		/* Clear our flags if they were not set by the guest */
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
	}
}

static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	control->pause_filter_count = __grow_ple_window(old,
							pause_filter_count,
							pause_filter_count_grow,
							pause_filter_count_max);

	if (control->pause_filter_count != old) {
		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    control->pause_filter_count, old);
	}
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	control->pause_filter_count =
			__shrink_ple_window(old,
					    pause_filter_count,
					    pause_filter_count_shrink,
					    pause_filter_count);
	if (control->pause_filter_count != old) {
		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    control->pause_filter_count, old);
	}
}

/*
 * The default MMIO mask is a single bit (excluding the present bit),
 * which could conflict with the memory encryption bit. Check for
 * memory encryption support and override the default MMIO mask if
 * memory encryption is enabled.
 */
static __init void svm_adjust_mmio_mask(void)
{
	unsigned int enc_bit, mask_bit;
	u64 msr, mask;

	/* If there is no memory encryption support, use existing mask */
	if (cpuid_eax(0x80000000) < 0x8000001f)
		return;

	/* If memory encryption is not enabled, use existing mask */
	rdmsrl(MSR_AMD64_SYSCFG, msr);
	if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
		return;

	enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
	mask_bit = boot_cpu_data.x86_phys_bits;

	/* Increment the mask bit if it is the same as the encryption bit */
	if (enc_bit == mask_bit)
		mask_bit++;

	/*
	 * If the mask bit location is below 52, then some bits above the
	 * physical addressing limit will always be reserved, so use the
	 * rsvd_bits() function to generate the mask. This mask, along with
	 * the present bit, will be used to generate a page fault with
	 * PFER.RSV = 1.
	 *
	 * If the mask bit location is 52 (or above), then clear the mask.
	 */
	mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;

	kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
}
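
/*
 * Worked example, not from the original source: on a hypothetical part
 * reporting the C-bit at position 47 with 48 physical address bits,
 * enc_bit != mask_bit so mask_bit stays 48, and the MMIO mask becomes
 * rsvd_bits(48, 51) | PT_PRESENT_MASK - a reserved-bit pattern that can
 * never collide with the encryption bit in a legitimate PTE.
 */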

static void svm_hardware_teardown(void)
{
	int cpu;

	sev_hardware_teardown();

	for_each_possible_cpu(cpu)
		svm_cpu_uninit(cpu);

	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
		     get_order(IOPM_SIZE));
	iopm_base = 0;
}

static __init void svm_set_cpu_caps(void)
{
	kvm_set_cpu_caps();

	supported_xss = 0;

	/* CPUID 0x80000001 and 0x8000000A (SVM features) */
	if (nested) {
		kvm_cpu_cap_set(X86_FEATURE_SVM);

		if (nrips)
			kvm_cpu_cap_set(X86_FEATURE_NRIPS);

		if (npt_enabled)
			kvm_cpu_cap_set(X86_FEATURE_NPT);

		if (tsc_scaling)
			kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);

		/* Nested VM can receive #VMEXIT instead of triggering #GP */
		kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
	}

	/* CPUID 0x80000008 */
	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
	    boot_cpu_has(X86_FEATURE_AMD_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);

	/* AMD PMU PERFCTR_CORE CPUID */
	if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
		kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);

	/* CPUID 0x8000001F (SME/SEV features) */
	sev_set_cpu_caps();
}

static __init int svm_hardware_setup(void)
{
	int cpu;
	struct page *iopm_pages;
	void *iopm_va;
	int r;
	unsigned int order = get_order(IOPM_SIZE);

	/*
	 * NX is required for shadow paging and for NPT if the NX huge pages
	 * mitigation is enabled.
	 */
	if (!boot_cpu_has(X86_FEATURE_NX)) {
		pr_err_ratelimited("NX (Execute Disable) not supported\n");
		return -EOPNOTSUPP;
	}
	kvm_enable_efer_bits(EFER_NX);

	iopm_pages = alloc_pages(GFP_KERNEL, order);

	if (!iopm_pages)
		return -ENOMEM;

	iopm_va = page_address(iopm_pages);
	memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;

	init_msrpm_offsets();

	supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);

	if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
		kvm_enable_efer_bits(EFER_FFXSR);

	if (tsc_scaling) {
		if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
			tsc_scaling = false;
		} else {
			pr_info("TSC scaling supported\n");
			kvm_has_tsc_control = true;
			kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
			kvm_tsc_scaling_ratio_frac_bits = 32;
		}
	}

	tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);

	/* Check for pause filtering support */
	if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
		pause_filter_count = 0;
		pause_filter_thresh = 0;
	} else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
		pause_filter_thresh = 0;
	}

	if (nested) {
		printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
	}

	/*
	 * KVM's MMU doesn't support using 2-level paging for itself, and thus
	 * NPT isn't supported if the host is using 2-level paging since host
	 * CR4 is unchanged on VMRUN.
	 */
	if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
		npt_enabled = false;

	if (!boot_cpu_has(X86_FEATURE_NPT))
		npt_enabled = false;

	/* Force VM NPT level equal to the host's paging level */
	kvm_configure_mmu(npt_enabled, get_npt_level(),
			  get_npt_level(), PG_LEVEL_1G);
	pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");

	/* Note, SEV setup consumes npt_enabled. */
	sev_hardware_setup();

	svm_hv_hardware_setup();

	svm_adjust_mmio_mask();

	for_each_possible_cpu(cpu) {
		r = svm_cpu_init(cpu);
		if (r)
			goto err;
	}

	if (nrips) {
		if (!boot_cpu_has(X86_FEATURE_NRIPS))
			nrips = false;
	}

	enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);

	if (enable_apicv) {
		pr_info("AVIC enabled\n");

		amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
	}

	if (vls) {
		if (!npt_enabled ||
		    !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
		    !IS_ENABLED(CONFIG_X86_64)) {
			vls = false;
		} else {
			pr_info("Virtual VMLOAD VMSAVE supported\n");
		}
	}

	if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
		svm_gp_erratum_intercept = false;

	if (vgif) {
		if (!boot_cpu_has(X86_FEATURE_VGIF))
			vgif = false;
		else
			pr_info("Virtual GIF supported\n");
	}

	if (lbrv) {
		if (!boot_cpu_has(X86_FEATURE_LBRV))
			lbrv = false;
		else
			pr_info("LBR virtualization supported\n");
	}

	if (!pmu)
		pr_info("PMU virtualization is disabled\n");

	svm_set_cpu_caps();

	/*
	 * It seems that on AMD processors PTE's accessed bit is
	 * being set by the CPU hardware before the NPF vmexit.
	 * This is not expected behaviour and our tests fail because
	 * of it.
	 * A workaround here is to disable support for
	 * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
	 * In this case userspace can know if there is support using the
	 * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle it.
	 * If future AMD CPU models change the behaviour described above,
	 * this variable can be changed accordingly.
	 */
	allow_smaller_maxphyaddr = !npt_enabled;

	return 0;

err:
	svm_hardware_teardown();
	return r;
}

static void init_seg(struct vmcb_seg *seg)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
	seg->limit = 0xffff;
	seg->base = 0;
}

static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | type;
	seg->limit = 0xffff;
	seg->base = 0;
}

static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return svm->nested.ctl.tsc_offset;
}

static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return svm->tsc_ratio_msr;
}

static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
	svm->vmcb->control.tsc_offset = offset;
	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}

void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
{
	wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
}

/* Evaluate instruction intercepts that depend on guest CPUID features. */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
					      struct vcpu_svm *svm)
{
	/*
	 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
	 * roots, or if INVPCID is disabled in the guest to inject #UD.
	 */
	if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
		if (!npt_enabled ||
		    !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
			svm_set_intercept(svm, INTERCEPT_INVPCID);
		else
			svm_clr_intercept(svm, INTERCEPT_INVPCID);
	}

	if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
		if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
			svm_clr_intercept(svm, INTERCEPT_RDTSCP);
		else
			svm_set_intercept(svm, INTERCEPT_RDTSCP);
	}
}

static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (guest_cpuid_is_intel(vcpu)) {
		/*
		 * We must intercept SYSENTER_EIP and SYSENTER_ESP
		 * accesses because the processor only stores 32 bits.
		 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
		 */
		svm_set_intercept(svm, INTERCEPT_VMLOAD);
		svm_set_intercept(svm, INTERCEPT_VMSAVE);
		svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;

		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
	} else {
		/*
		 * If hardware supports Virtual VMLOAD VMSAVE then enable it
		 * in VMCB and clear intercepts to avoid #VMEXIT.
		 */
		if (vls) {
			svm_clr_intercept(svm, INTERCEPT_VMLOAD);
			svm_clr_intercept(svm, INTERCEPT_VMSAVE);
			svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
		}
		/* No need to intercept these MSRs */
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
	}
}

static void init_vmcb(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	struct vmcb_save_area *save = &svm->vmcb->save;

	svm_set_intercept(svm, INTERCEPT_CR0_READ);
	svm_set_intercept(svm, INTERCEPT_CR3_READ);
	svm_set_intercept(svm, INTERCEPT_CR4_READ);
	svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
	svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
	svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
	if (!kvm_vcpu_apicv_active(vcpu))
		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);

	set_dr_intercepts(svm);

	set_exception_intercept(svm, PF_VECTOR);
	set_exception_intercept(svm, UD_VECTOR);
	set_exception_intercept(svm, MC_VECTOR);
	set_exception_intercept(svm, AC_VECTOR);
	set_exception_intercept(svm, DB_VECTOR);
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of TSS I/O permission bitmap.
	 * We intercept those #GP and allow access to them anyway
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		set_exception_intercept(svm, GP_VECTOR);

	svm_set_intercept(svm, INTERCEPT_INTR);
	svm_set_intercept(svm, INTERCEPT_NMI);

	if (intercept_smi)
		svm_set_intercept(svm, INTERCEPT_SMI);

	svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
	svm_set_intercept(svm, INTERCEPT_RDPMC);
	svm_set_intercept(svm, INTERCEPT_CPUID);
	svm_set_intercept(svm, INTERCEPT_INVD);
	svm_set_intercept(svm, INTERCEPT_INVLPG);
	svm_set_intercept(svm, INTERCEPT_INVLPGA);
	svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
	svm_set_intercept(svm, INTERCEPT_MSR_PROT);
	svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
	svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
	svm_set_intercept(svm, INTERCEPT_VMRUN);
	svm_set_intercept(svm, INTERCEPT_VMMCALL);
	svm_set_intercept(svm, INTERCEPT_VMLOAD);
	svm_set_intercept(svm, INTERCEPT_VMSAVE);
	svm_set_intercept(svm, INTERCEPT_STGI);
	svm_set_intercept(svm, INTERCEPT_CLGI);
	svm_set_intercept(svm, INTERCEPT_SKINIT);
	svm_set_intercept(svm, INTERCEPT_WBINVD);
	svm_set_intercept(svm, INTERCEPT_XSETBV);
	svm_set_intercept(svm, INTERCEPT_RDPRU);
	svm_set_intercept(svm, INTERCEPT_RSM);

	if (!kvm_mwait_in_guest(vcpu->kvm)) {
		svm_set_intercept(svm, INTERCEPT_MONITOR);
		svm_set_intercept(svm, INTERCEPT_MWAIT);
	}

	if (!kvm_hlt_in_guest(vcpu->kvm))
		svm_set_intercept(svm, INTERCEPT_HLT);

	control->iopm_base_pa = __sme_set(iopm_base);
	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
	control->int_ctl = V_INTR_MASKING_MASK;

	init_seg(&save->es);
	init_seg(&save->ss);
	init_seg(&save->ds);
	init_seg(&save->fs);
	init_seg(&save->gs);

	save->cs.selector = 0xf000;
	save->cs.base = 0xffff0000;
	/* Executable/Readable Code Segment */
	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
			  SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
	save->cs.limit = 0xffff;

	save->gdtr.base = 0;
	save->gdtr.limit = 0xffff;
	save->idtr.base = 0;
	save->idtr.limit = 0xffff;

	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);

	if (npt_enabled) {
		/* Setup VMCB for Nested Paging */
		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
		svm_clr_intercept(svm, INTERCEPT_INVLPG);
		clr_exception_intercept(svm, PF_VECTOR);
		svm_clr_intercept(svm, INTERCEPT_CR3_READ);
		svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
		save->g_pat = vcpu->arch.pat;
		save->cr3 = 0;
	}
	svm->current_vmcb->asid_generation = 0;
	svm->asid = 0;

	svm->nested.vmcb12_gpa = INVALID_GPA;
	svm->nested.last_vmcb12_gpa = INVALID_GPA;

	if (!kvm_pause_in_guest(vcpu->kvm)) {
		control->pause_filter_count = pause_filter_count;
		if (pause_filter_thresh)
			control->pause_filter_thresh = pause_filter_thresh;
		svm_set_intercept(svm, INTERCEPT_PAUSE);
	} else {
		svm_clr_intercept(svm, INTERCEPT_PAUSE);
	}

	svm_recalc_instruction_intercepts(vcpu, svm);

	/*
	 * If the host supports V_SPEC_CTRL then disable the interception
	 * of MSR_IA32_SPEC_CTRL.
	 */
	if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);

	if (kvm_vcpu_apicv_active(vcpu))
		avic_init_vmcb(svm);

	if (vgif) {
		svm_clr_intercept(svm, INTERCEPT_STGI);
		svm_clr_intercept(svm, INTERCEPT_CLGI);
		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
	}

	if (sev_guest(vcpu->kvm)) {
		svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
		clr_exception_intercept(svm, UD_VECTOR);

		if (sev_es_guest(vcpu->kvm)) {
			/* Perform SEV-ES specific VMCB updates */
			sev_es_init_vmcb(svm);
		}
	}

	svm_hv_init_vmcb(svm->vmcb);
	init_vmcb_after_set_cpuid(vcpu);

	vmcb_mark_all_dirty(svm->vmcb);

	enable_gif(svm);
}

static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm_vcpu_init_msrpm(vcpu, svm->msrpm);

	svm_init_osvw(vcpu);
	vcpu->arch.microcode_version = 0x01000065;
	svm->tsc_ratio_msr = kvm_default_tsc_scaling_ratio;

	if (sev_es_guest(vcpu->kvm))
		sev_es_vcpu_reset(svm);
}

static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->spec_ctrl = 0;
	svm->virt_spec_ctrl = 0;

	init_vmcb(vcpu);

	if (!init_event)
		__svm_vcpu_reset(vcpu);
}

void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
{
	svm->current_vmcb = target_vmcb;
	svm->vmcb = target_vmcb->ptr;
}

static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm;
	struct page *vmcb01_page;
	struct page *vmsa_page = NULL;
	int err;

	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
	svm = to_svm(vcpu);

	err = -ENOMEM;
	vmcb01_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
	if (!vmcb01_page)
		goto out;

	if (sev_es_guest(vcpu->kvm)) {
		/*
		 * SEV-ES guests require a separate VMSA page used to contain
		 * the encrypted register state of the guest.
		 */
		vmsa_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
		if (!vmsa_page)
			goto error_free_vmcb_page;

		/*
		 * SEV-ES guests maintain an encrypted version of their FPU
		 * state which is restored and saved on VMRUN and VMEXIT.
		 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
		 * do xsave/xrstor on it.
		 */
		fpstate_set_confidential(&vcpu->arch.guest_fpu);
	}

	err = avic_init_vcpu(svm);
	if (err)
		goto error_free_vmsa_page;

	/*
	 * We initialize this flag to true to make sure that the is_running
	 * bit is set the first time the vcpu is loaded.
	 */
	if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
		svm->avic_is_running = true;

	svm->msrpm = svm_vcpu_alloc_msrpm();
	if (!svm->msrpm) {
		err = -ENOMEM;
		goto error_free_vmsa_page;
	}

	svm->vmcb01.ptr = page_address(vmcb01_page);
	svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
	svm_switch_vmcb(svm, &svm->vmcb01);

	if (vmsa_page)
		svm->sev_es.vmsa = page_address(vmsa_page);

	svm->guest_state_loaded = false;

	return 0;

error_free_vmsa_page:
	if (vmsa_page)
		__free_page(vmsa_page);
error_free_vmcb_page:
	__free_page(vmcb01_page);
out:
	return err;
}

static void svm_clear_current_vmcb(struct vmcb *vmcb)
{
	int i;

	for_each_online_cpu(i)
		cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
}

static void svm_free_vcpu(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * The vmcb page can be recycled, causing a false negative in
	 * svm_vcpu_load(). So, ensure that no logical CPU has this
	 * vmcb page recorded as its current vmcb.
	 */
	svm_clear_current_vmcb(svm->vmcb);

	svm_free_nested(svm);

	sev_free_vcpu(vcpu);

	__free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
	__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}

static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);

	if (sev_es_guest(vcpu->kvm))
		sev_es_unmap_ghcb(svm);

	if (svm->guest_state_loaded)
		return;

	/*
	 * Save additional host state that will be restored on VMEXIT (sev-es)
	 * or subsequent vmload of host save area.
	 */
	if (sev_es_guest(vcpu->kvm)) {
		sev_es_prepare_guest_switch(svm, vcpu->cpu);
	} else {
		vmsave(__sme_page_pa(sd->save_area));
	}

	if (tsc_scaling) {
		u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
		if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
			__this_cpu_write(current_tsc_ratio, tsc_ratio);
			wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
		}
	}

	if (likely(tsc_aux_uret_slot >= 0))
		kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);

	svm->guest_state_loaded = true;
}

static void svm_prepare_host_switch(struct kvm_vcpu *vcpu)
{
	to_svm(vcpu)->guest_state_loaded = false;
}

static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);

	if (sd->current_vmcb != svm->vmcb) {
		sd->current_vmcb = svm->vmcb;
		indirect_branch_prediction_barrier();
	}
	if (kvm_vcpu_apicv_active(vcpu))
		avic_vcpu_load(vcpu, cpu);
}

static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
	if (kvm_vcpu_apicv_active(vcpu))
		avic_vcpu_put(vcpu);

	svm_prepare_host_switch(vcpu);

	++vcpu->stat.host_state_reload;
}

static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long rflags = svm->vmcb->save.rflags;

	if (svm->nmi_singlestep) {
		/* Hide our flags if they were not set by the guest */
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
			rflags &= ~X86_EFLAGS_TF;
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
			rflags &= ~X86_EFLAGS_RF;
	}
	return rflags;
}

static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	if (to_svm(vcpu)->nmi_singlestep)
		rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);

	/*
	 * Any change of EFLAGS.VM is accompanied by a reload of SS
	 * (caused by either a task switch or an inter-privilege IRET),
	 * so we do not need to update the CPL here.
	 */
	to_svm(vcpu)->vmcb->save.rflags = rflags;
}

static bool svm_get_if_flag(struct kvm_vcpu *vcpu)
{
	struct vmcb *vmcb = to_svm(vcpu)->vmcb;

	return sev_es_guest(vcpu->kvm)
		? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK
		: kvm_get_rflags(vcpu) & X86_EFLAGS_IF;
}

static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
	kvm_register_mark_available(vcpu, reg);

	switch (reg) {
	case VCPU_EXREG_PDPTR:
		/*
		 * When !npt_enabled, mmu->pdptrs[] is already available since
		 * it is always updated per SDM when moving to CRs.
		 */
		if (npt_enabled)
			load_pdptrs(vcpu, kvm_read_cr3(vcpu));
		break;
	default:
		KVM_BUG_ON(1, vcpu->kvm);
	}
}

static void svm_set_vintr(struct vcpu_svm *svm)
{
	struct vmcb_control_area *control;

	/*
	 * The following fields are ignored when AVIC is enabled
	 */
	WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));

	svm_set_intercept(svm, INTERCEPT_VINTR);

	/*
	 * This is just a dummy VINTR to actually cause a vmexit to happen.
	 * Actual injection of virtual interrupts happens through EVENTINJ.
	 */
	control = &svm->vmcb->control;
	control->int_vector = 0x0;
	control->int_ctl &= ~V_INTR_PRIO_MASK;
	control->int_ctl |= V_IRQ_MASK |
		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}

static void svm_clear_vintr(struct vcpu_svm *svm)
{
	svm_clr_intercept(svm, INTERCEPT_VINTR);

	/* Drop int_ctl fields related to VINTR injection. */
	svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;
	if (is_guest_mode(&svm->vcpu)) {
		svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK;

		WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) !=
			(svm->nested.ctl.int_ctl & V_TPR_MASK));

		svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl &
			V_IRQ_INJECTION_BITS_MASK;

		svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
	}

	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
}

static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
	struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save;

	switch (seg) {
	case VCPU_SREG_CS: return &save->cs;
	case VCPU_SREG_DS: return &save->ds;
	case VCPU_SREG_ES: return &save->es;
	case VCPU_SREG_FS: return &save01->fs;
	case VCPU_SREG_GS: return &save01->gs;
	case VCPU_SREG_SS: return &save->ss;
	case VCPU_SREG_TR: return &save01->tr;
	case VCPU_SREG_LDTR: return &save01->ldtr;
	}
	BUG();
	return NULL;
}

static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	return s->base;
}

static void svm_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	var->base = s->base;
	var->limit = s->limit;
	var->selector = s->selector;
	var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
	var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
	var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
	var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
	var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;

	/*
	 * AMD CPUs circa 2014 track the G bit for all segments except CS.
	 * However, the SVM spec states that the G bit is not observed by the
	 * CPU, and some VMware virtual CPUs drop the G bit for all segments.
	 * So let's synthesize a legal G bit for all segments, this helps
	 * running KVM nested. It also helps cross-vendor migration, because
	 * Intel's vmentry has a check on the 'G' bit.
	 */
	var->g = s->limit > 0xfffff;

	/*
	 * AMD's VMCB does not have an explicit unusable field, so emulate it
	 * for cross vendor migration purposes by "not present"
	 */
	var->unusable = !var->present;

	switch (seg) {
	case VCPU_SREG_TR:
		/*
		 * Work around a bug where the busy flag in the tr selector
		 * isn't exposed
		 */
		var->type |= 0x2;
		break;
	case VCPU_SREG_DS:
	case VCPU_SREG_ES:
	case VCPU_SREG_FS:
	case VCPU_SREG_GS:
		/*
		 * The accessed bit must always be set in the segment
		 * descriptor cache; although it can be cleared in the
		 * descriptor itself, the cached bit always remains at 1.
		 * Since Intel has a check on this, set it here to support
		 * cross-vendor migration.
		 */
		if (!var->unusable)
			var->type |= 0x1;
		break;
	case VCPU_SREG_SS:
		/*
		 * On AMD CPUs sometimes the DB bit in the segment
		 * descriptor is left as 1, although the whole segment has
		 * been made unusable. Clear it here to pass an Intel VMX
		 * entry check when cross vendor migrating.
		 */
		if (var->unusable)
			var->db = 0;
		/* This is symmetric with svm_set_segment() */
		var->dpl = to_svm(vcpu)->vmcb->save.cpl;
		break;
	}
}
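
/*
 * Illustrative layout note, not from the original source: the VMCB
 * stores segment attributes in a compressed 12-bit form, which is what
 * the SVM_SELECTOR_*_SHIFT accessors above and in svm_set_segment()
 * unpack and repack: type in bits 3:0, then S (4), DPL (6:5), P (7),
 * AVL (8), L (9), DB (10) and G (11).
 */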

static int svm_get_cpl(struct kvm_vcpu *vcpu)
{
	struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;

	return save->cpl;
}

static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	dt->size = svm->vmcb->save.idtr.limit;
	dt->address = svm->vmcb->save.idtr.base;
}

static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->save.idtr.limit = dt->size;
	svm->vmcb->save.idtr.base = dt->address;
	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}

static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	dt->size = svm->vmcb->save.gdtr.limit;
	dt->address = svm->vmcb->save.gdtr.base;
}

static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->save.gdtr.limit = dt->size;
	svm->vmcb->save.gdtr.base = dt->address;
	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
}

static void svm_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * For guests that don't set guest_state_protected, the cr3 update is
	 * handled via kvm_mmu_load() while entering the guest. For guests
	 * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to
	 * VMCB save area now, since the save area will become the initial
	 * contents of the VMSA, and future VMCB save area updates won't be
	 * seen.
	 */
	if (sev_es_guest(vcpu->kvm)) {
		svm->vmcb->save.cr3 = cr3;
		vmcb_mark_dirty(svm->vmcb, VMCB_CR);
	}
}

void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 hcr0 = cr0;

#ifdef CONFIG_X86_64
	if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) {
		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
			vcpu->arch.efer |= EFER_LMA;
			svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
		}

		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
			vcpu->arch.efer &= ~EFER_LMA;
			svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
		}
	}
#endif
	vcpu->arch.cr0 = cr0;

	if (!npt_enabled)
		hcr0 |= X86_CR0_PG | X86_CR0_WP;

	/*
	 * Re-enable caching here because the QEMU BIOS does not do it;
	 * this results in some delay at reboot.
	 */
	if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
		hcr0 &= ~(X86_CR0_CD | X86_CR0_NW);

	svm->vmcb->save.cr0 = hcr0;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);

	/*
	 * SEV-ES guests must always keep the CR intercepts cleared. CR
	 * tracking is done using the CR write traps.
	 */
	if (sev_es_guest(vcpu->kvm))
		return;

	if (hcr0 == cr0) {
		/* Selective CR0 write remains on. */
		svm_clr_intercept(svm, INTERCEPT_CR0_READ);
		svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
	} else {
		svm_set_intercept(svm, INTERCEPT_CR0_READ);
		svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
	}
}

static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	return true;
}

void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
	unsigned long old_cr4 = vcpu->arch.cr4;

	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
		svm_flush_tlb(vcpu);

	vcpu->arch.cr4 = cr4;
	if (!npt_enabled)
		cr4 |= X86_CR4_PAE;
	cr4 |= host_cr4_mce;
	to_svm(vcpu)->vmcb->save.cr4 = cr4;
	vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);

	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
		kvm_update_cpuid_runtime(vcpu);
}

static void svm_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_seg *s = svm_seg(vcpu, seg);

	s->base = var->base;
	s->limit = var->limit;
	s->selector = var->selector;
	s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
	s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
	s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
	s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
	s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
	s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
	s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
	s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;

	/*
	 * This is always accurate, except if SYSRET returned to a segment
	 * with SS.DPL != 3. Intel does not have this quirk, and always
	 * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
	 * would entail passing the CPL to userspace and back.
1920 */ 1921 if (seg == VCPU_SREG_SS) 1922 /* This is symmetric with svm_get_segment() */ 1923 svm->vmcb->save.cpl = (var->dpl & 3); 1924 1925 vmcb_mark_dirty(svm->vmcb, VMCB_SEG); 1926 } 1927 1928 static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu) 1929 { 1930 struct vcpu_svm *svm = to_svm(vcpu); 1931 1932 clr_exception_intercept(svm, BP_VECTOR); 1933 1934 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1935 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 1936 set_exception_intercept(svm, BP_VECTOR); 1937 } 1938 } 1939 1940 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1941 { 1942 if (sd->next_asid > sd->max_asid) { 1943 ++sd->asid_generation; 1944 sd->next_asid = sd->min_asid; 1945 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; 1946 vmcb_mark_dirty(svm->vmcb, VMCB_ASID); 1947 } 1948 1949 svm->current_vmcb->asid_generation = sd->asid_generation; 1950 svm->asid = sd->next_asid++; 1951 } 1952 1953 static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) 1954 { 1955 struct vmcb *vmcb = svm->vmcb; 1956 1957 if (svm->vcpu.arch.guest_state_protected) 1958 return; 1959 1960 if (unlikely(value != vmcb->save.dr6)) { 1961 vmcb->save.dr6 = value; 1962 vmcb_mark_dirty(vmcb, VMCB_DR); 1963 } 1964 } 1965 1966 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 1967 { 1968 struct vcpu_svm *svm = to_svm(vcpu); 1969 1970 if (vcpu->arch.guest_state_protected) 1971 return; 1972 1973 get_debugreg(vcpu->arch.db[0], 0); 1974 get_debugreg(vcpu->arch.db[1], 1); 1975 get_debugreg(vcpu->arch.db[2], 2); 1976 get_debugreg(vcpu->arch.db[3], 3); 1977 /* 1978 * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here, 1979 * because db_interception might need it. We can do it before vmentry. 1980 */ 1981 vcpu->arch.dr6 = svm->vmcb->save.dr6; 1982 vcpu->arch.dr7 = svm->vmcb->save.dr7; 1983 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 1984 set_dr_intercepts(svm); 1985 } 1986 1987 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 1988 { 1989 struct vcpu_svm *svm = to_svm(vcpu); 1990 1991 if (vcpu->arch.guest_state_protected) 1992 return; 1993 1994 svm->vmcb->save.dr7 = value; 1995 vmcb_mark_dirty(svm->vmcb, VMCB_DR); 1996 } 1997 1998 static int pf_interception(struct kvm_vcpu *vcpu) 1999 { 2000 struct vcpu_svm *svm = to_svm(vcpu); 2001 2002 u64 fault_address = svm->vmcb->control.exit_info_2; 2003 u64 error_code = svm->vmcb->control.exit_info_1; 2004 2005 return kvm_handle_page_fault(vcpu, error_code, fault_address, 2006 static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 2007 svm->vmcb->control.insn_bytes : NULL, 2008 svm->vmcb->control.insn_len); 2009 } 2010 2011 static int npf_interception(struct kvm_vcpu *vcpu) 2012 { 2013 struct vcpu_svm *svm = to_svm(vcpu); 2014 2015 u64 fault_address = svm->vmcb->control.exit_info_2; 2016 u64 error_code = svm->vmcb->control.exit_info_1; 2017 2018 trace_kvm_page_fault(fault_address, error_code); 2019 return kvm_mmu_page_fault(vcpu, fault_address, error_code, 2020 static_cpu_has(X86_FEATURE_DECODEASSISTS) ? 
2021 svm->vmcb->control.insn_bytes : NULL, 2022 svm->vmcb->control.insn_len); 2023 } 2024 2025 static int db_interception(struct kvm_vcpu *vcpu) 2026 { 2027 struct kvm_run *kvm_run = vcpu->run; 2028 struct vcpu_svm *svm = to_svm(vcpu); 2029 2030 if (!(vcpu->guest_debug & 2031 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && 2032 !svm->nmi_singlestep) { 2033 u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW; 2034 kvm_queue_exception_p(vcpu, DB_VECTOR, payload); 2035 return 1; 2036 } 2037 2038 if (svm->nmi_singlestep) { 2039 disable_nmi_singlestep(svm); 2040 /* Make sure we check for pending NMIs upon entry */ 2041 kvm_make_request(KVM_REQ_EVENT, vcpu); 2042 } 2043 2044 if (vcpu->guest_debug & 2045 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { 2046 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2047 kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6; 2048 kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7; 2049 kvm_run->debug.arch.pc = 2050 svm->vmcb->save.cs.base + svm->vmcb->save.rip; 2051 kvm_run->debug.arch.exception = DB_VECTOR; 2052 return 0; 2053 } 2054 2055 return 1; 2056 } 2057 2058 static int bp_interception(struct kvm_vcpu *vcpu) 2059 { 2060 struct vcpu_svm *svm = to_svm(vcpu); 2061 struct kvm_run *kvm_run = vcpu->run; 2062 2063 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2064 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 2065 kvm_run->debug.arch.exception = BP_VECTOR; 2066 return 0; 2067 } 2068 2069 static int ud_interception(struct kvm_vcpu *vcpu) 2070 { 2071 return handle_ud(vcpu); 2072 } 2073 2074 static int ac_interception(struct kvm_vcpu *vcpu) 2075 { 2076 kvm_queue_exception_e(vcpu, AC_VECTOR, 0); 2077 return 1; 2078 } 2079 2080 static bool is_erratum_383(void) 2081 { 2082 int err, i; 2083 u64 value; 2084 2085 if (!erratum_383_found) 2086 return false; 2087 2088 value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); 2089 if (err) 2090 return false; 2091 2092 /* Bit 62 may or may not be set for this mce */ 2093 value &= ~(1ULL << 62); 2094 2095 if (value != 0xb600000000010015ULL) 2096 return false; 2097 2098 /* Clear MCi_STATUS registers */ 2099 for (i = 0; i < 6; ++i) 2100 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); 2101 2102 value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); 2103 if (!err) { 2104 u32 low, high; 2105 2106 value &= ~(1ULL << 2); 2107 low = lower_32_bits(value); 2108 high = upper_32_bits(value); 2109 2110 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); 2111 } 2112 2113 /* Flush tlb to evict multi-match entries */ 2114 __flush_tlb_all(); 2115 2116 return true; 2117 } 2118 2119 static void svm_handle_mce(struct kvm_vcpu *vcpu) 2120 { 2121 if (is_erratum_383()) { 2122 /* 2123 * Erratum 383 triggered. Guest state is corrupt so kill the 2124 * guest. 2125 */ 2126 pr_err("KVM: Guest triggered AMD Erratum 383\n"); 2127 2128 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2129 2130 return; 2131 } 2132 2133 /* 2134 * On an #MC intercept the MCE handler is not called automatically in 2135 * the host. So do it by hand here. 2136 */ 2137 kvm_machine_check(); 2138 } 2139 2140 static int mc_interception(struct kvm_vcpu *vcpu) 2141 { 2142 return 1; 2143 } 2144 2145 static int shutdown_interception(struct kvm_vcpu *vcpu) 2146 { 2147 struct kvm_run *kvm_run = vcpu->run; 2148 struct vcpu_svm *svm = to_svm(vcpu); 2149 2150 /* 2151 * The VM save area has already been encrypted so it 2152 * cannot be reinitialized - just terminate. 
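	 * Returning -EINVAL here propagates straight out of KVM_RUN, so
	 * userspace sees the ioctl fail rather than a normal exit.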
2153 */ 2154 if (sev_es_guest(vcpu->kvm)) 2155 return -EINVAL; 2156 2157 /* 2158 * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put 2159 * the VMCB in a known good state. Unfortuately, KVM doesn't have 2160 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking 2161 * userspace. At a platform view, INIT is acceptable behavior as 2162 * there exist bare metal platforms that automatically INIT the CPU 2163 * in response to shutdown. 2164 */ 2165 clear_page(svm->vmcb); 2166 kvm_vcpu_reset(vcpu, true); 2167 2168 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2169 return 0; 2170 } 2171 2172 static int io_interception(struct kvm_vcpu *vcpu) 2173 { 2174 struct vcpu_svm *svm = to_svm(vcpu); 2175 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 2176 int size, in, string; 2177 unsigned port; 2178 2179 ++vcpu->stat.io_exits; 2180 string = (io_info & SVM_IOIO_STR_MASK) != 0; 2181 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 2182 port = io_info >> 16; 2183 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 2184 2185 if (string) { 2186 if (sev_es_guest(vcpu->kvm)) 2187 return sev_es_string_io(svm, size, port, in); 2188 else 2189 return kvm_emulate_instruction(vcpu, 0); 2190 } 2191 2192 svm->next_rip = svm->vmcb->control.exit_info_2; 2193 2194 return kvm_fast_pio(vcpu, size, port, in); 2195 } 2196 2197 static int nmi_interception(struct kvm_vcpu *vcpu) 2198 { 2199 return 1; 2200 } 2201 2202 static int smi_interception(struct kvm_vcpu *vcpu) 2203 { 2204 return 1; 2205 } 2206 2207 static int intr_interception(struct kvm_vcpu *vcpu) 2208 { 2209 ++vcpu->stat.irq_exits; 2210 return 1; 2211 } 2212 2213 static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) 2214 { 2215 struct vcpu_svm *svm = to_svm(vcpu); 2216 struct vmcb *vmcb12; 2217 struct kvm_host_map map; 2218 int ret; 2219 2220 if (nested_svm_check_permissions(vcpu)) 2221 return 1; 2222 2223 ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); 2224 if (ret) { 2225 if (ret == -EINVAL) 2226 kvm_inject_gp(vcpu, 0); 2227 return 1; 2228 } 2229 2230 vmcb12 = map.hva; 2231 2232 ret = kvm_skip_emulated_instruction(vcpu); 2233 2234 if (vmload) { 2235 svm_copy_vmloadsave_state(svm->vmcb, vmcb12); 2236 svm->sysenter_eip_hi = 0; 2237 svm->sysenter_esp_hi = 0; 2238 } else { 2239 svm_copy_vmloadsave_state(vmcb12, svm->vmcb); 2240 } 2241 2242 kvm_vcpu_unmap(vcpu, &map, true); 2243 2244 return ret; 2245 } 2246 2247 static int vmload_interception(struct kvm_vcpu *vcpu) 2248 { 2249 return vmload_vmsave_interception(vcpu, true); 2250 } 2251 2252 static int vmsave_interception(struct kvm_vcpu *vcpu) 2253 { 2254 return vmload_vmsave_interception(vcpu, false); 2255 } 2256 2257 static int vmrun_interception(struct kvm_vcpu *vcpu) 2258 { 2259 if (nested_svm_check_permissions(vcpu)) 2260 return 1; 2261 2262 return nested_svm_vmrun(vcpu); 2263 } 2264 2265 enum { 2266 NONE_SVM_INSTR, 2267 SVM_INSTR_VMRUN, 2268 SVM_INSTR_VMLOAD, 2269 SVM_INSTR_VMSAVE, 2270 }; 2271 2272 /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */ 2273 static int svm_instr_opcode(struct kvm_vcpu *vcpu) 2274 { 2275 struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; 2276 2277 if (ctxt->b != 0x1 || ctxt->opcode_len != 2) 2278 return NONE_SVM_INSTR; 2279 2280 switch (ctxt->modrm) { 2281 case 0xd8: /* VMRUN */ 2282 return SVM_INSTR_VMRUN; 2283 case 0xda: /* VMLOAD */ 2284 return SVM_INSTR_VMLOAD; 2285 case 0xdb: /* VMSAVE */ 2286 return SVM_INSTR_VMSAVE; 2287 default: 2288 break; 2289 } 2290 2291 
	return NONE_SVM_INSTR;
}

static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode)
{
	const int guest_mode_exit_codes[] = {
		[SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN,
		[SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD,
		[SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE,
	};
	int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = {
		[SVM_INSTR_VMRUN] = vmrun_interception,
		[SVM_INSTR_VMLOAD] = vmload_interception,
		[SVM_INSTR_VMSAVE] = vmsave_interception,
	};
	struct vcpu_svm *svm = to_svm(vcpu);
	int ret;

	if (is_guest_mode(vcpu)) {
		/* Returns '1' or -errno on failure, '0' on success. */
		ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
		if (ret)
			return ret;
		return 1;
	}
	return svm_instr_handlers[opcode](vcpu);
}

/*
 * #GP handling code. Note that #GP can be triggered under the following two
 * cases:
 *   1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on
 *      some AMD CPUs when the EAX of these instructions is in a reserved
 *      memory region (e.g. SMM memory on the host).
 *   2) VMware backdoor
 */
static int gp_interception(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 error_code = svm->vmcb->control.exit_info_1;
	int opcode;

	/* Both #GP cases have zero error_code */
	if (error_code)
		goto reinject;

	/* All SVM instructions expect page aligned RAX */
	if (svm->vmcb->save.rax & ~PAGE_MASK)
		goto reinject;

	/* Decode the instruction for usage later */
	if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
		goto reinject;

	opcode = svm_instr_opcode(vcpu);

	if (opcode == NONE_SVM_INSTR) {
		if (!enable_vmware_backdoor)
			goto reinject;

		/*
		 * VMware backdoor emulation on #GP interception only handles
		 * IN{S}, OUT{S}, and RDPMC.
		 */
		if (!is_guest_mode(vcpu))
			return kvm_emulate_instruction(vcpu,
				EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
	} else
		return emulate_svm_instr(vcpu, opcode);

reinject:
	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
	return 1;
}

void svm_set_gif(struct vcpu_svm *svm, bool value)
{
	if (value) {
		/*
		 * If VGIF is enabled, the STGI intercept is only added to
		 * detect the opening of the SMI/NMI window; remove it now.
		 * Likewise, clear the VINTR intercept; we will set it again
		 * while processing KVM_REQ_EVENT if needed.
		 */
		if (vgif_enabled(svm))
			svm_clr_intercept(svm, INTERCEPT_STGI);
		if (svm_is_intercept(svm, INTERCEPT_VINTR))
			svm_clear_vintr(svm);

		enable_gif(svm);
		if (svm->vcpu.arch.smi_pending ||
		    svm->vcpu.arch.nmi_pending ||
		    kvm_cpu_has_injectable_intr(&svm->vcpu))
			kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
	} else {
		disable_gif(svm);

		/*
		 * After a CLGI no interrupts should come. But if vGIF is
		 * in use, we still rely on the VINTR intercept (rather than
		 * STGI) to detect an open interrupt window.
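		 * Hence, VINTR is cleared here only when vGIF is not in use.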
2392 */ 2393 if (!vgif_enabled(svm)) 2394 svm_clear_vintr(svm); 2395 } 2396 } 2397 2398 static int stgi_interception(struct kvm_vcpu *vcpu) 2399 { 2400 int ret; 2401 2402 if (nested_svm_check_permissions(vcpu)) 2403 return 1; 2404 2405 ret = kvm_skip_emulated_instruction(vcpu); 2406 svm_set_gif(to_svm(vcpu), true); 2407 return ret; 2408 } 2409 2410 static int clgi_interception(struct kvm_vcpu *vcpu) 2411 { 2412 int ret; 2413 2414 if (nested_svm_check_permissions(vcpu)) 2415 return 1; 2416 2417 ret = kvm_skip_emulated_instruction(vcpu); 2418 svm_set_gif(to_svm(vcpu), false); 2419 return ret; 2420 } 2421 2422 static int invlpga_interception(struct kvm_vcpu *vcpu) 2423 { 2424 gva_t gva = kvm_rax_read(vcpu); 2425 u32 asid = kvm_rcx_read(vcpu); 2426 2427 /* FIXME: Handle an address size prefix. */ 2428 if (!is_long_mode(vcpu)) 2429 gva = (u32)gva; 2430 2431 trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva); 2432 2433 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 2434 kvm_mmu_invlpg(vcpu, gva); 2435 2436 return kvm_skip_emulated_instruction(vcpu); 2437 } 2438 2439 static int skinit_interception(struct kvm_vcpu *vcpu) 2440 { 2441 trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu)); 2442 2443 kvm_queue_exception(vcpu, UD_VECTOR); 2444 return 1; 2445 } 2446 2447 static int task_switch_interception(struct kvm_vcpu *vcpu) 2448 { 2449 struct vcpu_svm *svm = to_svm(vcpu); 2450 u16 tss_selector; 2451 int reason; 2452 int int_type = svm->vmcb->control.exit_int_info & 2453 SVM_EXITINTINFO_TYPE_MASK; 2454 int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; 2455 uint32_t type = 2456 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; 2457 uint32_t idt_v = 2458 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; 2459 bool has_error_code = false; 2460 u32 error_code = 0; 2461 2462 tss_selector = (u16)svm->vmcb->control.exit_info_1; 2463 2464 if (svm->vmcb->control.exit_info_2 & 2465 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) 2466 reason = TASK_SWITCH_IRET; 2467 else if (svm->vmcb->control.exit_info_2 & 2468 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) 2469 reason = TASK_SWITCH_JMP; 2470 else if (idt_v) 2471 reason = TASK_SWITCH_GATE; 2472 else 2473 reason = TASK_SWITCH_CALL; 2474 2475 if (reason == TASK_SWITCH_GATE) { 2476 switch (type) { 2477 case SVM_EXITINTINFO_TYPE_NMI: 2478 vcpu->arch.nmi_injected = false; 2479 break; 2480 case SVM_EXITINTINFO_TYPE_EXEPT: 2481 if (svm->vmcb->control.exit_info_2 & 2482 (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { 2483 has_error_code = true; 2484 error_code = 2485 (u32)svm->vmcb->control.exit_info_2; 2486 } 2487 kvm_clear_exception_queue(vcpu); 2488 break; 2489 case SVM_EXITINTINFO_TYPE_INTR: 2490 kvm_clear_interrupt_queue(vcpu); 2491 break; 2492 default: 2493 break; 2494 } 2495 } 2496 2497 if (reason != TASK_SWITCH_GATE || 2498 int_type == SVM_EXITINTINFO_TYPE_SOFT || 2499 (int_type == SVM_EXITINTINFO_TYPE_EXEPT && 2500 (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { 2501 if (!skip_emulated_instruction(vcpu)) 2502 return 0; 2503 } 2504 2505 if (int_type != SVM_EXITINTINFO_TYPE_SOFT) 2506 int_vec = -1; 2507 2508 return kvm_task_switch(vcpu, tss_selector, int_vec, reason, 2509 has_error_code, error_code); 2510 } 2511 2512 static int iret_interception(struct kvm_vcpu *vcpu) 2513 { 2514 struct vcpu_svm *svm = to_svm(vcpu); 2515 2516 ++vcpu->stat.nmi_window_exits; 2517 vcpu->arch.hflags |= HF_IRET_MASK; 2518 if (!sev_es_guest(vcpu->kvm)) { 2519 svm_clr_intercept(svm, INTERCEPT_IRET); 2520 
svm->nmi_iret_rip = kvm_rip_read(vcpu); 2521 } 2522 kvm_make_request(KVM_REQ_EVENT, vcpu); 2523 return 1; 2524 } 2525 2526 static int invlpg_interception(struct kvm_vcpu *vcpu) 2527 { 2528 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 2529 return kvm_emulate_instruction(vcpu, 0); 2530 2531 kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1); 2532 return kvm_skip_emulated_instruction(vcpu); 2533 } 2534 2535 static int emulate_on_interception(struct kvm_vcpu *vcpu) 2536 { 2537 return kvm_emulate_instruction(vcpu, 0); 2538 } 2539 2540 static int rsm_interception(struct kvm_vcpu *vcpu) 2541 { 2542 return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2); 2543 } 2544 2545 static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, 2546 unsigned long val) 2547 { 2548 struct vcpu_svm *svm = to_svm(vcpu); 2549 unsigned long cr0 = vcpu->arch.cr0; 2550 bool ret = false; 2551 2552 if (!is_guest_mode(vcpu) || 2553 (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0)))) 2554 return false; 2555 2556 cr0 &= ~SVM_CR0_SELECTIVE_MASK; 2557 val &= ~SVM_CR0_SELECTIVE_MASK; 2558 2559 if (cr0 ^ val) { 2560 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; 2561 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); 2562 } 2563 2564 return ret; 2565 } 2566 2567 #define CR_VALID (1ULL << 63) 2568 2569 static int cr_interception(struct kvm_vcpu *vcpu) 2570 { 2571 struct vcpu_svm *svm = to_svm(vcpu); 2572 int reg, cr; 2573 unsigned long val; 2574 int err; 2575 2576 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) 2577 return emulate_on_interception(vcpu); 2578 2579 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) 2580 return emulate_on_interception(vcpu); 2581 2582 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2583 if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE) 2584 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0; 2585 else 2586 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; 2587 2588 err = 0; 2589 if (cr >= 16) { /* mov to cr */ 2590 cr -= 16; 2591 val = kvm_register_read(vcpu, reg); 2592 trace_kvm_cr_write(cr, val); 2593 switch (cr) { 2594 case 0: 2595 if (!check_selective_cr0_intercepted(vcpu, val)) 2596 err = kvm_set_cr0(vcpu, val); 2597 else 2598 return 1; 2599 2600 break; 2601 case 3: 2602 err = kvm_set_cr3(vcpu, val); 2603 break; 2604 case 4: 2605 err = kvm_set_cr4(vcpu, val); 2606 break; 2607 case 8: 2608 err = kvm_set_cr8(vcpu, val); 2609 break; 2610 default: 2611 WARN(1, "unhandled write to CR%d", cr); 2612 kvm_queue_exception(vcpu, UD_VECTOR); 2613 return 1; 2614 } 2615 } else { /* mov from cr */ 2616 switch (cr) { 2617 case 0: 2618 val = kvm_read_cr0(vcpu); 2619 break; 2620 case 2: 2621 val = vcpu->arch.cr2; 2622 break; 2623 case 3: 2624 val = kvm_read_cr3(vcpu); 2625 break; 2626 case 4: 2627 val = kvm_read_cr4(vcpu); 2628 break; 2629 case 8: 2630 val = kvm_get_cr8(vcpu); 2631 break; 2632 default: 2633 WARN(1, "unhandled read from CR%d", cr); 2634 kvm_queue_exception(vcpu, UD_VECTOR); 2635 return 1; 2636 } 2637 kvm_register_write(vcpu, reg, val); 2638 trace_kvm_cr_read(cr, val); 2639 } 2640 return kvm_complete_insn_gp(vcpu, err); 2641 } 2642 2643 static int cr_trap(struct kvm_vcpu *vcpu) 2644 { 2645 struct vcpu_svm *svm = to_svm(vcpu); 2646 unsigned long old_value, new_value; 2647 unsigned int cr; 2648 int ret = 0; 2649 2650 new_value = (unsigned long)svm->vmcb->control.exit_info_1; 2651 2652 cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP; 2653 switch (cr) { 2654 case 0: 2655 old_value = 
kvm_read_cr0(vcpu); 2656 svm_set_cr0(vcpu, new_value); 2657 2658 kvm_post_set_cr0(vcpu, old_value, new_value); 2659 break; 2660 case 4: 2661 old_value = kvm_read_cr4(vcpu); 2662 svm_set_cr4(vcpu, new_value); 2663 2664 kvm_post_set_cr4(vcpu, old_value, new_value); 2665 break; 2666 case 8: 2667 ret = kvm_set_cr8(vcpu, new_value); 2668 break; 2669 default: 2670 WARN(1, "unhandled CR%d write trap", cr); 2671 kvm_queue_exception(vcpu, UD_VECTOR); 2672 return 1; 2673 } 2674 2675 return kvm_complete_insn_gp(vcpu, ret); 2676 } 2677 2678 static int dr_interception(struct kvm_vcpu *vcpu) 2679 { 2680 struct vcpu_svm *svm = to_svm(vcpu); 2681 int reg, dr; 2682 unsigned long val; 2683 int err = 0; 2684 2685 if (vcpu->guest_debug == 0) { 2686 /* 2687 * No more DR vmexits; force a reload of the debug registers 2688 * and reenter on this instruction. The next vmexit will 2689 * retrieve the full state of the debug registers. 2690 */ 2691 clr_dr_intercepts(svm); 2692 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 2693 return 1; 2694 } 2695 2696 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) 2697 return emulate_on_interception(vcpu); 2698 2699 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; 2700 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; 2701 if (dr >= 16) { /* mov to DRn */ 2702 dr -= 16; 2703 val = kvm_register_read(vcpu, reg); 2704 err = kvm_set_dr(vcpu, dr, val); 2705 } else { 2706 kvm_get_dr(vcpu, dr, &val); 2707 kvm_register_write(vcpu, reg, val); 2708 } 2709 2710 return kvm_complete_insn_gp(vcpu, err); 2711 } 2712 2713 static int cr8_write_interception(struct kvm_vcpu *vcpu) 2714 { 2715 int r; 2716 2717 u8 cr8_prev = kvm_get_cr8(vcpu); 2718 /* instruction emulation calls kvm_set_cr8() */ 2719 r = cr_interception(vcpu); 2720 if (lapic_in_kernel(vcpu)) 2721 return r; 2722 if (cr8_prev <= kvm_get_cr8(vcpu)) 2723 return r; 2724 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 2725 return 0; 2726 } 2727 2728 static int efer_trap(struct kvm_vcpu *vcpu) 2729 { 2730 struct msr_data msr_info; 2731 int ret; 2732 2733 /* 2734 * Clear the EFER_SVME bit from EFER. The SVM code always sets this 2735 * bit in svm_set_efer(), but __kvm_valid_efer() checks it against 2736 * whether the guest has X86_FEATURE_SVM - this avoids a failure if 2737 * the guest doesn't have X86_FEATURE_SVM. 
2738 */ 2739 msr_info.host_initiated = false; 2740 msr_info.index = MSR_EFER; 2741 msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME; 2742 ret = kvm_set_msr_common(vcpu, &msr_info); 2743 2744 return kvm_complete_insn_gp(vcpu, ret); 2745 } 2746 2747 static int svm_get_msr_feature(struct kvm_msr_entry *msr) 2748 { 2749 msr->data = 0; 2750 2751 switch (msr->index) { 2752 case MSR_F10H_DECFG: 2753 if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) 2754 msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; 2755 break; 2756 case MSR_IA32_PERF_CAPABILITIES: 2757 return 0; 2758 default: 2759 return KVM_MSR_RET_INVALID; 2760 } 2761 2762 return 0; 2763 } 2764 2765 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2766 { 2767 struct vcpu_svm *svm = to_svm(vcpu); 2768 2769 switch (msr_info->index) { 2770 case MSR_AMD64_TSC_RATIO: 2771 if (!msr_info->host_initiated && !svm->tsc_scaling_enabled) 2772 return 1; 2773 msr_info->data = svm->tsc_ratio_msr; 2774 break; 2775 case MSR_STAR: 2776 msr_info->data = svm->vmcb01.ptr->save.star; 2777 break; 2778 #ifdef CONFIG_X86_64 2779 case MSR_LSTAR: 2780 msr_info->data = svm->vmcb01.ptr->save.lstar; 2781 break; 2782 case MSR_CSTAR: 2783 msr_info->data = svm->vmcb01.ptr->save.cstar; 2784 break; 2785 case MSR_KERNEL_GS_BASE: 2786 msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; 2787 break; 2788 case MSR_SYSCALL_MASK: 2789 msr_info->data = svm->vmcb01.ptr->save.sfmask; 2790 break; 2791 #endif 2792 case MSR_IA32_SYSENTER_CS: 2793 msr_info->data = svm->vmcb01.ptr->save.sysenter_cs; 2794 break; 2795 case MSR_IA32_SYSENTER_EIP: 2796 msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip; 2797 if (guest_cpuid_is_intel(vcpu)) 2798 msr_info->data |= (u64)svm->sysenter_eip_hi << 32; 2799 break; 2800 case MSR_IA32_SYSENTER_ESP: 2801 msr_info->data = svm->vmcb01.ptr->save.sysenter_esp; 2802 if (guest_cpuid_is_intel(vcpu)) 2803 msr_info->data |= (u64)svm->sysenter_esp_hi << 32; 2804 break; 2805 case MSR_TSC_AUX: 2806 msr_info->data = svm->tsc_aux; 2807 break; 2808 /* 2809 * Nobody will change the following 5 values in the VMCB so we can 2810 * safely return them on rdmsr. They will always be 0 until LBRV is 2811 * implemented. 
2812 */ 2813 case MSR_IA32_DEBUGCTLMSR: 2814 msr_info->data = svm->vmcb->save.dbgctl; 2815 break; 2816 case MSR_IA32_LASTBRANCHFROMIP: 2817 msr_info->data = svm->vmcb->save.br_from; 2818 break; 2819 case MSR_IA32_LASTBRANCHTOIP: 2820 msr_info->data = svm->vmcb->save.br_to; 2821 break; 2822 case MSR_IA32_LASTINTFROMIP: 2823 msr_info->data = svm->vmcb->save.last_excp_from; 2824 break; 2825 case MSR_IA32_LASTINTTOIP: 2826 msr_info->data = svm->vmcb->save.last_excp_to; 2827 break; 2828 case MSR_VM_HSAVE_PA: 2829 msr_info->data = svm->nested.hsave_msr; 2830 break; 2831 case MSR_VM_CR: 2832 msr_info->data = svm->nested.vm_cr_msr; 2833 break; 2834 case MSR_IA32_SPEC_CTRL: 2835 if (!msr_info->host_initiated && 2836 !guest_has_spec_ctrl_msr(vcpu)) 2837 return 1; 2838 2839 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 2840 msr_info->data = svm->vmcb->save.spec_ctrl; 2841 else 2842 msr_info->data = svm->spec_ctrl; 2843 break; 2844 case MSR_AMD64_VIRT_SPEC_CTRL: 2845 if (!msr_info->host_initiated && 2846 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) 2847 return 1; 2848 2849 msr_info->data = svm->virt_spec_ctrl; 2850 break; 2851 case MSR_F15H_IC_CFG: { 2852 2853 int family, model; 2854 2855 family = guest_cpuid_family(vcpu); 2856 model = guest_cpuid_model(vcpu); 2857 2858 if (family < 0 || model < 0) 2859 return kvm_get_msr_common(vcpu, msr_info); 2860 2861 msr_info->data = 0; 2862 2863 if (family == 0x15 && 2864 (model >= 0x2 && model < 0x20)) 2865 msr_info->data = 0x1E; 2866 } 2867 break; 2868 case MSR_F10H_DECFG: 2869 msr_info->data = svm->msr_decfg; 2870 break; 2871 default: 2872 return kvm_get_msr_common(vcpu, msr_info); 2873 } 2874 return 0; 2875 } 2876 2877 static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) 2878 { 2879 struct vcpu_svm *svm = to_svm(vcpu); 2880 if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) 2881 return kvm_complete_insn_gp(vcpu, err); 2882 2883 ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); 2884 ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 2885 X86_TRAP_GP | 2886 SVM_EVTINJ_TYPE_EXEPT | 2887 SVM_EVTINJ_VALID); 2888 return 1; 2889 } 2890 2891 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) 2892 { 2893 struct vcpu_svm *svm = to_svm(vcpu); 2894 int svm_dis, chg_mask; 2895 2896 if (data & ~SVM_VM_CR_VALID_MASK) 2897 return 1; 2898 2899 chg_mask = SVM_VM_CR_VALID_MASK; 2900 2901 if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) 2902 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); 2903 2904 svm->nested.vm_cr_msr &= ~chg_mask; 2905 svm->nested.vm_cr_msr |= (data & chg_mask); 2906 2907 svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; 2908 2909 /* check for svm_disable while efer.svme is set */ 2910 if (svm_dis && (vcpu->arch.efer & EFER_SVME)) 2911 return 1; 2912 2913 return 0; 2914 } 2915 2916 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 2917 { 2918 struct vcpu_svm *svm = to_svm(vcpu); 2919 int r; 2920 2921 u32 ecx = msr->index; 2922 u64 data = msr->data; 2923 switch (ecx) { 2924 case MSR_AMD64_TSC_RATIO: 2925 if (!msr->host_initiated && !svm->tsc_scaling_enabled) 2926 return 1; 2927 2928 if (data & TSC_RATIO_RSVD) 2929 return 1; 2930 2931 svm->tsc_ratio_msr = data; 2932 2933 if (svm->tsc_scaling_enabled && is_guest_mode(vcpu)) 2934 nested_svm_update_tsc_ratio_msr(vcpu); 2935 2936 break; 2937 case MSR_IA32_CR_PAT: 2938 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) 2939 return 1; 2940 vcpu->arch.pat = data; 2941 svm->vmcb01.ptr->save.g_pat = data; 2942 if (is_guest_mode(vcpu)) 2943 
nested_vmcb02_compute_g_pat(svm); 2944 vmcb_mark_dirty(svm->vmcb, VMCB_NPT); 2945 break; 2946 case MSR_IA32_SPEC_CTRL: 2947 if (!msr->host_initiated && 2948 !guest_has_spec_ctrl_msr(vcpu)) 2949 return 1; 2950 2951 if (kvm_spec_ctrl_test_value(data)) 2952 return 1; 2953 2954 if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) 2955 svm->vmcb->save.spec_ctrl = data; 2956 else 2957 svm->spec_ctrl = data; 2958 if (!data) 2959 break; 2960 2961 /* 2962 * For non-nested: 2963 * When it's written (to non-zero) for the first time, pass 2964 * it through. 2965 * 2966 * For nested: 2967 * The handling of the MSR bitmap for L2 guests is done in 2968 * nested_svm_vmrun_msrpm. 2969 * We update the L1 MSR bit as well since it will end up 2970 * touching the MSR anyway now. 2971 */ 2972 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1); 2973 break; 2974 case MSR_IA32_PRED_CMD: 2975 if (!msr->host_initiated && 2976 !guest_has_pred_cmd_msr(vcpu)) 2977 return 1; 2978 2979 if (data & ~PRED_CMD_IBPB) 2980 return 1; 2981 if (!boot_cpu_has(X86_FEATURE_IBPB)) 2982 return 1; 2983 if (!data) 2984 break; 2985 2986 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB); 2987 set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); 2988 break; 2989 case MSR_AMD64_VIRT_SPEC_CTRL: 2990 if (!msr->host_initiated && 2991 !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) 2992 return 1; 2993 2994 if (data & ~SPEC_CTRL_SSBD) 2995 return 1; 2996 2997 svm->virt_spec_ctrl = data; 2998 break; 2999 case MSR_STAR: 3000 svm->vmcb01.ptr->save.star = data; 3001 break; 3002 #ifdef CONFIG_X86_64 3003 case MSR_LSTAR: 3004 svm->vmcb01.ptr->save.lstar = data; 3005 break; 3006 case MSR_CSTAR: 3007 svm->vmcb01.ptr->save.cstar = data; 3008 break; 3009 case MSR_KERNEL_GS_BASE: 3010 svm->vmcb01.ptr->save.kernel_gs_base = data; 3011 break; 3012 case MSR_SYSCALL_MASK: 3013 svm->vmcb01.ptr->save.sfmask = data; 3014 break; 3015 #endif 3016 case MSR_IA32_SYSENTER_CS: 3017 svm->vmcb01.ptr->save.sysenter_cs = data; 3018 break; 3019 case MSR_IA32_SYSENTER_EIP: 3020 svm->vmcb01.ptr->save.sysenter_eip = (u32)data; 3021 /* 3022 * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs 3023 * when we spoof an Intel vendor ID (for cross vendor migration). 3024 * In this case we use this intercept to track the high 3025 * 32 bit part of these msrs to support Intel's 3026 * implementation of SYSENTER/SYSEXIT. 3027 */ 3028 svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; 3029 break; 3030 case MSR_IA32_SYSENTER_ESP: 3031 svm->vmcb01.ptr->save.sysenter_esp = (u32)data; 3032 svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; 3033 break; 3034 case MSR_TSC_AUX: 3035 /* 3036 * TSC_AUX is usually changed only during boot and never read 3037 * directly. Intercept TSC_AUX instead of exposing it to the 3038 * guest via direct_access_msrs, and switch it via user return. 
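	 * The user-return MSR framework restores the host's TSC_AUX value
	 * once the CPU returns to userspace.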
3039 */ 3040 preempt_disable(); 3041 r = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull); 3042 preempt_enable(); 3043 if (r) 3044 return 1; 3045 3046 svm->tsc_aux = data; 3047 break; 3048 case MSR_IA32_DEBUGCTLMSR: 3049 if (!lbrv) { 3050 vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 3051 __func__, data); 3052 break; 3053 } 3054 if (data & DEBUGCTL_RESERVED_BITS) 3055 return 1; 3056 3057 svm->vmcb->save.dbgctl = data; 3058 vmcb_mark_dirty(svm->vmcb, VMCB_LBR); 3059 if (data & (1ULL<<0)) 3060 svm_enable_lbrv(vcpu); 3061 else 3062 svm_disable_lbrv(vcpu); 3063 break; 3064 case MSR_VM_HSAVE_PA: 3065 /* 3066 * Old kernels did not validate the value written to 3067 * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid 3068 * value to allow live migrating buggy or malicious guests 3069 * originating from those kernels. 3070 */ 3071 if (!msr->host_initiated && !page_address_valid(vcpu, data)) 3072 return 1; 3073 3074 svm->nested.hsave_msr = data & PAGE_MASK; 3075 break; 3076 case MSR_VM_CR: 3077 return svm_set_vm_cr(vcpu, data); 3078 case MSR_VM_IGNNE: 3079 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 3080 break; 3081 case MSR_F10H_DECFG: { 3082 struct kvm_msr_entry msr_entry; 3083 3084 msr_entry.index = msr->index; 3085 if (svm_get_msr_feature(&msr_entry)) 3086 return 1; 3087 3088 /* Check the supported bits */ 3089 if (data & ~msr_entry.data) 3090 return 1; 3091 3092 /* Don't allow the guest to change a bit, #GP */ 3093 if (!msr->host_initiated && (data ^ msr_entry.data)) 3094 return 1; 3095 3096 svm->msr_decfg = data; 3097 break; 3098 } 3099 default: 3100 return kvm_set_msr_common(vcpu, msr); 3101 } 3102 return 0; 3103 } 3104 3105 static int msr_interception(struct kvm_vcpu *vcpu) 3106 { 3107 if (to_svm(vcpu)->vmcb->control.exit_info_1) 3108 return kvm_emulate_wrmsr(vcpu); 3109 else 3110 return kvm_emulate_rdmsr(vcpu); 3111 } 3112 3113 static int interrupt_window_interception(struct kvm_vcpu *vcpu) 3114 { 3115 kvm_make_request(KVM_REQ_EVENT, vcpu); 3116 svm_clear_vintr(to_svm(vcpu)); 3117 3118 /* 3119 * For AVIC, the only reason to end up here is ExtINTs. 3120 * In this case AVIC was temporarily disabled for 3121 * requesting the IRQ window and we have to re-enable it. 3122 */ 3123 kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN); 3124 3125 ++vcpu->stat.irq_window_exits; 3126 return 1; 3127 } 3128 3129 static int pause_interception(struct kvm_vcpu *vcpu) 3130 { 3131 bool in_kernel; 3132 3133 /* 3134 * CPL is not made available for an SEV-ES guest, therefore 3135 * vcpu->arch.preempted_in_kernel can never be true. Just 3136 * set in_kernel to false as well. 3137 */ 3138 in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0; 3139 3140 if (!kvm_pause_in_guest(vcpu->kvm)) 3141 grow_ple_window(vcpu); 3142 3143 kvm_vcpu_on_spin(vcpu, in_kernel); 3144 return kvm_skip_emulated_instruction(vcpu); 3145 } 3146 3147 static int invpcid_interception(struct kvm_vcpu *vcpu) 3148 { 3149 struct vcpu_svm *svm = to_svm(vcpu); 3150 unsigned long type; 3151 gva_t gva; 3152 3153 if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { 3154 kvm_queue_exception(vcpu, UD_VECTOR); 3155 return 1; 3156 } 3157 3158 /* 3159 * For an INVPCID intercept: 3160 * EXITINFO1 provides the linear address of the memory operand. 3161 * EXITINFO2 provides the contents of the register operand. 
3162 */ 3163 type = svm->vmcb->control.exit_info_2; 3164 gva = svm->vmcb->control.exit_info_1; 3165 3166 return kvm_handle_invpcid(vcpu, type, gva); 3167 } 3168 3169 static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { 3170 [SVM_EXIT_READ_CR0] = cr_interception, 3171 [SVM_EXIT_READ_CR3] = cr_interception, 3172 [SVM_EXIT_READ_CR4] = cr_interception, 3173 [SVM_EXIT_READ_CR8] = cr_interception, 3174 [SVM_EXIT_CR0_SEL_WRITE] = cr_interception, 3175 [SVM_EXIT_WRITE_CR0] = cr_interception, 3176 [SVM_EXIT_WRITE_CR3] = cr_interception, 3177 [SVM_EXIT_WRITE_CR4] = cr_interception, 3178 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 3179 [SVM_EXIT_READ_DR0] = dr_interception, 3180 [SVM_EXIT_READ_DR1] = dr_interception, 3181 [SVM_EXIT_READ_DR2] = dr_interception, 3182 [SVM_EXIT_READ_DR3] = dr_interception, 3183 [SVM_EXIT_READ_DR4] = dr_interception, 3184 [SVM_EXIT_READ_DR5] = dr_interception, 3185 [SVM_EXIT_READ_DR6] = dr_interception, 3186 [SVM_EXIT_READ_DR7] = dr_interception, 3187 [SVM_EXIT_WRITE_DR0] = dr_interception, 3188 [SVM_EXIT_WRITE_DR1] = dr_interception, 3189 [SVM_EXIT_WRITE_DR2] = dr_interception, 3190 [SVM_EXIT_WRITE_DR3] = dr_interception, 3191 [SVM_EXIT_WRITE_DR4] = dr_interception, 3192 [SVM_EXIT_WRITE_DR5] = dr_interception, 3193 [SVM_EXIT_WRITE_DR6] = dr_interception, 3194 [SVM_EXIT_WRITE_DR7] = dr_interception, 3195 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 3196 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 3197 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 3198 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 3199 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 3200 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, 3201 [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, 3202 [SVM_EXIT_INTR] = intr_interception, 3203 [SVM_EXIT_NMI] = nmi_interception, 3204 [SVM_EXIT_SMI] = smi_interception, 3205 [SVM_EXIT_VINTR] = interrupt_window_interception, 3206 [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc, 3207 [SVM_EXIT_CPUID] = kvm_emulate_cpuid, 3208 [SVM_EXIT_IRET] = iret_interception, 3209 [SVM_EXIT_INVD] = kvm_emulate_invd, 3210 [SVM_EXIT_PAUSE] = pause_interception, 3211 [SVM_EXIT_HLT] = kvm_emulate_halt, 3212 [SVM_EXIT_INVLPG] = invlpg_interception, 3213 [SVM_EXIT_INVLPGA] = invlpga_interception, 3214 [SVM_EXIT_IOIO] = io_interception, 3215 [SVM_EXIT_MSR] = msr_interception, 3216 [SVM_EXIT_TASK_SWITCH] = task_switch_interception, 3217 [SVM_EXIT_SHUTDOWN] = shutdown_interception, 3218 [SVM_EXIT_VMRUN] = vmrun_interception, 3219 [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall, 3220 [SVM_EXIT_VMLOAD] = vmload_interception, 3221 [SVM_EXIT_VMSAVE] = vmsave_interception, 3222 [SVM_EXIT_STGI] = stgi_interception, 3223 [SVM_EXIT_CLGI] = clgi_interception, 3224 [SVM_EXIT_SKINIT] = skinit_interception, 3225 [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op, 3226 [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, 3227 [SVM_EXIT_MONITOR] = kvm_emulate_monitor, 3228 [SVM_EXIT_MWAIT] = kvm_emulate_mwait, 3229 [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv, 3230 [SVM_EXIT_RDPRU] = kvm_handle_invalid_op, 3231 [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap, 3232 [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap, 3233 [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, 3234 [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, 3235 [SVM_EXIT_INVPCID] = invpcid_interception, 3236 [SVM_EXIT_NPF] = npf_interception, 3237 [SVM_EXIT_RSM] = rsm_interception, 3238 [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, 3239 [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, 3240 [SVM_EXIT_VMGEXIT] = 
sev_handle_vmgexit, 3241 }; 3242 3243 static void dump_vmcb(struct kvm_vcpu *vcpu) 3244 { 3245 struct vcpu_svm *svm = to_svm(vcpu); 3246 struct vmcb_control_area *control = &svm->vmcb->control; 3247 struct vmcb_save_area *save = &svm->vmcb->save; 3248 struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; 3249 3250 if (!dump_invalid_vmcb) { 3251 pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n"); 3252 return; 3253 } 3254 3255 pr_err("VMCB %p, last attempted VMRUN on CPU %d\n", 3256 svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); 3257 pr_err("VMCB Control Area:\n"); 3258 pr_err("%-20s%04x\n", "cr_read:", control->intercepts[INTERCEPT_CR] & 0xffff); 3259 pr_err("%-20s%04x\n", "cr_write:", control->intercepts[INTERCEPT_CR] >> 16); 3260 pr_err("%-20s%04x\n", "dr_read:", control->intercepts[INTERCEPT_DR] & 0xffff); 3261 pr_err("%-20s%04x\n", "dr_write:", control->intercepts[INTERCEPT_DR] >> 16); 3262 pr_err("%-20s%08x\n", "exceptions:", control->intercepts[INTERCEPT_EXCEPTION]); 3263 pr_err("%-20s%08x %08x\n", "intercepts:", 3264 control->intercepts[INTERCEPT_WORD3], 3265 control->intercepts[INTERCEPT_WORD4]); 3266 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); 3267 pr_err("%-20s%d\n", "pause filter threshold:", 3268 control->pause_filter_thresh); 3269 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); 3270 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); 3271 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); 3272 pr_err("%-20s%d\n", "asid:", control->asid); 3273 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); 3274 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); 3275 pr_err("%-20s%08x\n", "int_vector:", control->int_vector); 3276 pr_err("%-20s%08x\n", "int_state:", control->int_state); 3277 pr_err("%-20s%08x\n", "exit_code:", control->exit_code); 3278 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); 3279 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); 3280 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); 3281 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); 3282 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); 3283 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); 3284 pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); 3285 pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa); 3286 pr_err("%-20s%08x\n", "event_inj:", control->event_inj); 3287 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); 3288 pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); 3289 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); 3290 pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); 3291 pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); 3292 pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); 3293 pr_err("%-20s%016llx\n", "vmsa_pa:", control->vmsa_pa); 3294 pr_err("VMCB State Save Area:\n"); 3295 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3296 "es:", 3297 save->es.selector, save->es.attrib, 3298 save->es.limit, save->es.base); 3299 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3300 "cs:", 3301 save->cs.selector, save->cs.attrib, 3302 save->cs.limit, save->cs.base); 3303 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3304 "ss:", 3305 save->ss.selector, save->ss.attrib, 3306 save->ss.limit, save->ss.base); 3307 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3308 "ds:", 3309 
save->ds.selector, save->ds.attrib, 3310 save->ds.limit, save->ds.base); 3311 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3312 "fs:", 3313 save01->fs.selector, save01->fs.attrib, 3314 save01->fs.limit, save01->fs.base); 3315 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3316 "gs:", 3317 save01->gs.selector, save01->gs.attrib, 3318 save01->gs.limit, save01->gs.base); 3319 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3320 "gdtr:", 3321 save->gdtr.selector, save->gdtr.attrib, 3322 save->gdtr.limit, save->gdtr.base); 3323 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3324 "ldtr:", 3325 save01->ldtr.selector, save01->ldtr.attrib, 3326 save01->ldtr.limit, save01->ldtr.base); 3327 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3328 "idtr:", 3329 save->idtr.selector, save->idtr.attrib, 3330 save->idtr.limit, save->idtr.base); 3331 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", 3332 "tr:", 3333 save01->tr.selector, save01->tr.attrib, 3334 save01->tr.limit, save01->tr.base); 3335 pr_err("cpl: %d efer: %016llx\n", 3336 save->cpl, save->efer); 3337 pr_err("%-15s %016llx %-13s %016llx\n", 3338 "cr0:", save->cr0, "cr2:", save->cr2); 3339 pr_err("%-15s %016llx %-13s %016llx\n", 3340 "cr3:", save->cr3, "cr4:", save->cr4); 3341 pr_err("%-15s %016llx %-13s %016llx\n", 3342 "dr6:", save->dr6, "dr7:", save->dr7); 3343 pr_err("%-15s %016llx %-13s %016llx\n", 3344 "rip:", save->rip, "rflags:", save->rflags); 3345 pr_err("%-15s %016llx %-13s %016llx\n", 3346 "rsp:", save->rsp, "rax:", save->rax); 3347 pr_err("%-15s %016llx %-13s %016llx\n", 3348 "star:", save01->star, "lstar:", save01->lstar); 3349 pr_err("%-15s %016llx %-13s %016llx\n", 3350 "cstar:", save01->cstar, "sfmask:", save01->sfmask); 3351 pr_err("%-15s %016llx %-13s %016llx\n", 3352 "kernel_gs_base:", save01->kernel_gs_base, 3353 "sysenter_cs:", save01->sysenter_cs); 3354 pr_err("%-15s %016llx %-13s %016llx\n", 3355 "sysenter_esp:", save01->sysenter_esp, 3356 "sysenter_eip:", save01->sysenter_eip); 3357 pr_err("%-15s %016llx %-13s %016llx\n", 3358 "gpat:", save->g_pat, "dbgctl:", save->dbgctl); 3359 pr_err("%-15s %016llx %-13s %016llx\n", 3360 "br_from:", save->br_from, "br_to:", save->br_to); 3361 pr_err("%-15s %016llx %-13s %016llx\n", 3362 "excp_from:", save->last_excp_from, 3363 "excp_to:", save->last_excp_to); 3364 } 3365 3366 static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code) 3367 { 3368 return (exit_code < ARRAY_SIZE(svm_exit_handlers) && 3369 svm_exit_handlers[exit_code]); 3370 } 3371 3372 static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) 3373 { 3374 vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); 3375 dump_vmcb(vcpu); 3376 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3377 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; 3378 vcpu->run->internal.ndata = 2; 3379 vcpu->run->internal.data[0] = exit_code; 3380 vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; 3381 return 0; 3382 } 3383 3384 int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) 3385 { 3386 if (!svm_check_exit_valid(vcpu, exit_code)) 3387 return svm_handle_invalid_exit(vcpu, exit_code); 3388 3389 #ifdef CONFIG_RETPOLINE 3390 if (exit_code == SVM_EXIT_MSR) 3391 return msr_interception(vcpu); 3392 else if (exit_code == SVM_EXIT_VINTR) 3393 return interrupt_window_interception(vcpu); 3394 else if (exit_code == SVM_EXIT_INTR) 3395 return intr_interception(vcpu); 3396 else if (exit_code == SVM_EXIT_HLT) 3397 return 
kvm_emulate_halt(vcpu); 3398 else if (exit_code == SVM_EXIT_NPF) 3399 return npf_interception(vcpu); 3400 #endif 3401 return svm_exit_handlers[exit_code](vcpu); 3402 } 3403 3404 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, 3405 u64 *info1, u64 *info2, 3406 u32 *intr_info, u32 *error_code) 3407 { 3408 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; 3409 3410 *reason = control->exit_code; 3411 *info1 = control->exit_info_1; 3412 *info2 = control->exit_info_2; 3413 *intr_info = control->exit_int_info; 3414 if ((*intr_info & SVM_EXITINTINFO_VALID) && 3415 (*intr_info & SVM_EXITINTINFO_VALID_ERR)) 3416 *error_code = control->exit_int_info_err; 3417 else 3418 *error_code = 0; 3419 } 3420 3421 static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) 3422 { 3423 struct vcpu_svm *svm = to_svm(vcpu); 3424 struct kvm_run *kvm_run = vcpu->run; 3425 u32 exit_code = svm->vmcb->control.exit_code; 3426 3427 trace_kvm_exit(vcpu, KVM_ISA_SVM); 3428 3429 /* SEV-ES guests must use the CR write traps to track CR registers. */ 3430 if (!sev_es_guest(vcpu->kvm)) { 3431 if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) 3432 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3433 if (npt_enabled) 3434 vcpu->arch.cr3 = svm->vmcb->save.cr3; 3435 } 3436 3437 if (is_guest_mode(vcpu)) { 3438 int vmexit; 3439 3440 trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM); 3441 3442 vmexit = nested_svm_exit_special(svm); 3443 3444 if (vmexit == NESTED_EXIT_CONTINUE) 3445 vmexit = nested_svm_exit_handled(svm); 3446 3447 if (vmexit == NESTED_EXIT_DONE) 3448 return 1; 3449 } 3450 3451 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 3452 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3453 kvm_run->fail_entry.hardware_entry_failure_reason 3454 = svm->vmcb->control.exit_code; 3455 kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; 3456 dump_vmcb(vcpu); 3457 return 0; 3458 } 3459 3460 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 3461 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 3462 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && 3463 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) 3464 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " 3465 "exit_code 0x%x\n", 3466 __func__, svm->vmcb->control.exit_int_info, 3467 exit_code); 3468 3469 if (exit_fastpath != EXIT_FASTPATH_NONE) 3470 return 1; 3471 3472 return svm_invoke_exit_handler(vcpu, exit_code); 3473 } 3474 3475 static void reload_tss(struct kvm_vcpu *vcpu) 3476 { 3477 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); 3478 3479 sd->tss_desc->type = 9; /* available 32/64-bit TSS */ 3480 load_TR_desc(); 3481 } 3482 3483 static void pre_svm_run(struct kvm_vcpu *vcpu) 3484 { 3485 struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); 3486 struct vcpu_svm *svm = to_svm(vcpu); 3487 3488 /* 3489 * If the previous vmrun of the vmcb occurred on a different physical 3490 * cpu, then mark the vmcb dirty and assign a new asid. Hardware's 3491 * vmcb clean bits are per logical CPU, as are KVM's asid assignments. 
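	 * (SEV ASIDs are managed separately; see the pre_sev_run() call
	 * below.)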
3492 */ 3493 if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) { 3494 svm->current_vmcb->asid_generation = 0; 3495 vmcb_mark_all_dirty(svm->vmcb); 3496 svm->current_vmcb->cpu = vcpu->cpu; 3497 } 3498 3499 if (sev_guest(vcpu->kvm)) 3500 return pre_sev_run(svm, vcpu->cpu); 3501 3502 /* FIXME: handle wraparound of asid_generation */ 3503 if (svm->current_vmcb->asid_generation != sd->asid_generation) 3504 new_asid(svm, sd); 3505 } 3506 3507 static void svm_inject_nmi(struct kvm_vcpu *vcpu) 3508 { 3509 struct vcpu_svm *svm = to_svm(vcpu); 3510 3511 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 3512 vcpu->arch.hflags |= HF_NMI_MASK; 3513 if (!sev_es_guest(vcpu->kvm)) 3514 svm_set_intercept(svm, INTERCEPT_IRET); 3515 ++vcpu->stat.nmi_injections; 3516 } 3517 3518 static void svm_set_irq(struct kvm_vcpu *vcpu) 3519 { 3520 struct vcpu_svm *svm = to_svm(vcpu); 3521 3522 BUG_ON(!(gif_set(svm))); 3523 3524 trace_kvm_inj_virq(vcpu->arch.interrupt.nr); 3525 ++vcpu->stat.irq_injections; 3526 3527 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | 3528 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; 3529 } 3530 3531 static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) 3532 { 3533 struct vcpu_svm *svm = to_svm(vcpu); 3534 3535 /* 3536 * SEV-ES guests must always keep the CR intercepts cleared. CR 3537 * tracking is done using the CR write traps. 3538 */ 3539 if (sev_es_guest(vcpu->kvm)) 3540 return; 3541 3542 if (nested_svm_virtualize_tpr(vcpu)) 3543 return; 3544 3545 svm_clr_intercept(svm, INTERCEPT_CR8_WRITE); 3546 3547 if (irr == -1) 3548 return; 3549 3550 if (tpr >= irr) 3551 svm_set_intercept(svm, INTERCEPT_CR8_WRITE); 3552 } 3553 3554 bool svm_nmi_blocked(struct kvm_vcpu *vcpu) 3555 { 3556 struct vcpu_svm *svm = to_svm(vcpu); 3557 struct vmcb *vmcb = svm->vmcb; 3558 bool ret; 3559 3560 if (!gif_set(svm)) 3561 return true; 3562 3563 if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) 3564 return false; 3565 3566 ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || 3567 (vcpu->arch.hflags & HF_NMI_MASK); 3568 3569 return ret; 3570 } 3571 3572 static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) 3573 { 3574 struct vcpu_svm *svm = to_svm(vcpu); 3575 if (svm->nested.nested_run_pending) 3576 return -EBUSY; 3577 3578 /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ 3579 if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) 3580 return -EBUSY; 3581 3582 return !svm_nmi_blocked(vcpu); 3583 } 3584 3585 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) 3586 { 3587 return !!(vcpu->arch.hflags & HF_NMI_MASK); 3588 } 3589 3590 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 3591 { 3592 struct vcpu_svm *svm = to_svm(vcpu); 3593 3594 if (masked) { 3595 vcpu->arch.hflags |= HF_NMI_MASK; 3596 if (!sev_es_guest(vcpu->kvm)) 3597 svm_set_intercept(svm, INTERCEPT_IRET); 3598 } else { 3599 vcpu->arch.hflags &= ~HF_NMI_MASK; 3600 if (!sev_es_guest(vcpu->kvm)) 3601 svm_clr_intercept(svm, INTERCEPT_IRET); 3602 } 3603 } 3604 3605 bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) 3606 { 3607 struct vcpu_svm *svm = to_svm(vcpu); 3608 struct vmcb *vmcb = svm->vmcb; 3609 3610 if (!gif_set(svm)) 3611 return true; 3612 3613 if (is_guest_mode(vcpu)) { 3614 /* As long as interrupts are being delivered... */ 3615 if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) 3616 ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) 3617 : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) 3618 return true; 3619 3620 /* ... 
vmexits aren't blocked by the interrupt shadow */
		if (nested_exit_on_intr(svm))
			return false;
	} else {
		if (!svm_get_if_flag(vcpu))
			return true;
	}

	return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK);
}

static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (svm->nested.nested_run_pending)
		return -EBUSY;

	/*
	 * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
	 * e.g. if the IRQ arrived asynchronously after checking nested events.
	 */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm))
		return -EBUSY;

	return !svm_interrupt_blocked(vcpu);
}

static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
	 * 1, because that's a separate STGI/VMRUN intercept. The next time we
	 * get that intercept, this function will be called again though and
	 * we'll get the vintr intercept. However, if the vGIF feature is
	 * enabled, the STGI interception will not occur. Enable the irq
	 * window under the assumption that the hardware will set the GIF.
	 */
	if (vgif_enabled(svm) || gif_set(svm)) {
		/*
		 * IRQ window is not needed when AVIC is enabled,
		 * unless we have pending ExtINT since it cannot be injected
		 * via AVIC. In such case, we need to temporarily disable AVIC,
		 * and fallback to injecting IRQ via V_IRQ.
		 */
		kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN);
		svm_set_vintr(svm);
	}
}

static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
		return; /* IRET will cause a vm exit */

	if (!gif_set(svm)) {
		if (vgif_enabled(svm))
			svm_set_intercept(svm, INTERCEPT_STGI);
		return; /* STGI will cause a vm exit */
	}

	/*
	 * Something prevents NMI from being injected. Single step over
	 * the possible problem (IRET or exception injection or interrupt
	 * shadow).
	 */
	svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
	svm->nmi_singlestep = true;
	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}

static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	return 0;
}

static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
{
	return 0;
}

void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * Flush only the current ASID even if the TLB flush was invoked via
	 * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all
	 * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and
	 * unconditionally does a TLB flush on both nested VM-Enter and nested
	 * VM-Exit (via kvm_mmu_reset_context()).
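	 * Lacking FLUSHBYASID, bump the ASID generation instead, which
	 * forces new_asid() to allocate a fresh ASID before the next VMRUN.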
3713 */ 3714 if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) 3715 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; 3716 else 3717 svm->current_vmcb->asid_generation--; 3718 } 3719 3720 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) 3721 { 3722 struct vcpu_svm *svm = to_svm(vcpu); 3723 3724 invlpga(gva, svm->vmcb->control.asid); 3725 } 3726 3727 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) 3728 { 3729 struct vcpu_svm *svm = to_svm(vcpu); 3730 3731 if (nested_svm_virtualize_tpr(vcpu)) 3732 return; 3733 3734 if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) { 3735 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 3736 kvm_set_cr8(vcpu, cr8); 3737 } 3738 } 3739 3740 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) 3741 { 3742 struct vcpu_svm *svm = to_svm(vcpu); 3743 u64 cr8; 3744 3745 if (nested_svm_virtualize_tpr(vcpu) || 3746 kvm_vcpu_apicv_active(vcpu)) 3747 return; 3748 3749 cr8 = kvm_get_cr8(vcpu); 3750 svm->vmcb->control.int_ctl &= ~V_TPR_MASK; 3751 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 3752 } 3753 3754 static void svm_complete_interrupts(struct kvm_vcpu *vcpu) 3755 { 3756 struct vcpu_svm *svm = to_svm(vcpu); 3757 u8 vector; 3758 int type; 3759 u32 exitintinfo = svm->vmcb->control.exit_int_info; 3760 unsigned int3_injected = svm->int3_injected; 3761 3762 svm->int3_injected = 0; 3763 3764 /* 3765 * If we've made progress since setting HF_IRET_MASK, we've 3766 * executed an IRET and can allow NMI injection. 3767 */ 3768 if ((vcpu->arch.hflags & HF_IRET_MASK) && 3769 (sev_es_guest(vcpu->kvm) || 3770 kvm_rip_read(vcpu) != svm->nmi_iret_rip)) { 3771 vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 3772 kvm_make_request(KVM_REQ_EVENT, vcpu); 3773 } 3774 3775 vcpu->arch.nmi_injected = false; 3776 kvm_clear_exception_queue(vcpu); 3777 kvm_clear_interrupt_queue(vcpu); 3778 3779 if (!(exitintinfo & SVM_EXITINTINFO_VALID)) 3780 return; 3781 3782 kvm_make_request(KVM_REQ_EVENT, vcpu); 3783 3784 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; 3785 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; 3786 3787 switch (type) { 3788 case SVM_EXITINTINFO_TYPE_NMI: 3789 vcpu->arch.nmi_injected = true; 3790 break; 3791 case SVM_EXITINTINFO_TYPE_EXEPT: 3792 /* 3793 * Never re-inject a #VC exception. 3794 */ 3795 if (vector == X86_TRAP_VC) 3796 break; 3797 3798 /* 3799 * In case of software exceptions, do not reinject the vector, 3800 * but re-execute the instruction instead. Rewind RIP first 3801 * if we emulated INT3 before. 
static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u8 vector;
	int type;
	u32 exitintinfo = svm->vmcb->control.exit_int_info;
	unsigned int3_injected = svm->int3_injected;

	svm->int3_injected = 0;

	/*
	 * If we've made progress since setting HF_IRET_MASK, we've
	 * executed an IRET and can allow NMI injection.
	 */
	if ((vcpu->arch.hflags & HF_IRET_MASK) &&
	    (sev_es_guest(vcpu->kvm) ||
	     kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
		vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
		kvm_make_request(KVM_REQ_EVENT, vcpu);
	}

	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
		return;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;

	switch (type) {
	case SVM_EXITINTINFO_TYPE_NMI:
		vcpu->arch.nmi_injected = true;
		break;
	case SVM_EXITINTINFO_TYPE_EXEPT:
		/*
		 * Never re-inject a #VC exception.
		 */
		if (vector == X86_TRAP_VC)
			break;

		/*
		 * In case of software exceptions, do not reinject the vector,
		 * but re-execute the instruction instead. Rewind RIP first
		 * if we emulated INT3 before.
		 */
		if (kvm_exception_is_soft(vector)) {
			if (vector == BP_VECTOR && int3_injected &&
			    kvm_is_linear_rip(vcpu, svm->int3_rip))
				kvm_rip_write(vcpu,
					      kvm_rip_read(vcpu) - int3_injected);
			break;
		}
		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
			u32 err = svm->vmcb->control.exit_int_info_err;

			kvm_requeue_exception_e(vcpu, vector, err);
		} else
			kvm_requeue_exception(vcpu, vector);
		break;
	case SVM_EXITINTINFO_TYPE_INTR:
		kvm_queue_interrupt(vcpu, vector, false);
		break;
	default:
		break;
	}
}

static void svm_cancel_injection(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;

	control->exit_int_info = control->event_inj;
	control->exit_int_info_err = control->event_inj_err;
	control->event_inj = 0;
	svm_complete_interrupts(vcpu);
}

static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
	if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
	    to_svm(vcpu)->vmcb->control.exit_info_1)
		return handle_fastpath_set_msr_irqoff(vcpu);

	return EXIT_FASTPATH_NONE;
}

static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long vmcb_pa = svm->current_vmcb->pa;

	kvm_guest_enter_irqoff();

	if (sev_es_guest(vcpu->kvm)) {
		__svm_sev_es_vcpu_run(vmcb_pa);
	} else {
		struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu);

		/*
		 * Use a single vmcb (vmcb01 because it's always valid) for
		 * context switching guest state via VMLOAD/VMSAVE, that way
		 * the state doesn't need to be copied between vmcb01 and
		 * vmcb02 when switching vmcbs for nested virtualization.
		 */
		vmload(svm->vmcb01.pa);
		__svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs);
		vmsave(svm->vmcb01.pa);

		vmload(__sme_page_pa(sd->save_area));
	}

	kvm_guest_exit_irqoff();
}
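/*
 * Reader's note on the VMLOAD/VMSAVE dance above (descriptive only):
 * for a non-SEV-ES guest the sequence is
 *
 *	vmload(vmcb01)		- load the additional guest state (FS/GS/TR,
 *				  KernelGsBase, STAR family, ...) that VMRUN
 *				  does not load itself;
 *	__svm_vcpu_run()	- VMRUN on the current vmcb (vmcb01 or vmcb02);
 *	vmsave(vmcb01)		- stash that same state back into vmcb01;
 *	vmload(host save area)	- restore the host's copy of it.
 *
 * Keeping this state in vmcb01 only means nothing has to be copied
 * between vmcb01 and vmcb02 on nested transitions.
 */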
static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	trace_kvm_entry(vcpu);

	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

	/*
	 * Disable singlestep if we're injecting an interrupt/exception.
	 * We don't want our modified rflags to be pushed on the stack where
	 * we might not be able to easily reset them if we disabled NMI
	 * singlestep later.
	 */
	if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
		/*
		 * Event injection happens before external interrupts cause a
		 * vmexit and interrupts are disabled here, so smp_send_reschedule
		 * is enough to force an immediate vmexit.
		 */
		disable_nmi_singlestep(svm);
		smp_send_reschedule(vcpu->cpu);
	}

	pre_svm_run(vcpu);

	sync_lapic_to_cr8(vcpu);

	if (unlikely(svm->asid != svm->vmcb->control.asid)) {
		svm->vmcb->control.asid = svm->asid;
		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
	}
	svm->vmcb->save.cr2 = vcpu->arch.cr2;

	svm_hv_update_vp_id(svm->vmcb, vcpu);

	/*
	 * Run with all-zero DR6 unless needed, so that we can get the exact
	 * cause of a #DB.
	 */
	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
		svm_set_dr6(svm, vcpu->arch.dr6);
	else
		svm_set_dr6(svm, DR6_ACTIVE_LOW);

	clgi();
	kvm_load_guest_xsave_state(vcpu);

	kvm_wait_lapic_expire(vcpu);

	/*
	 * If this vCPU has touched SPEC_CTRL, restore the guest's value if
	 * it's non-zero. Since vmentry is serialising on affected CPUs, there
	 * is no need to worry about the conditional branch over the wrmsr
	 * being speculatively taken.
	 */
	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
		x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);

	svm_vcpu_enter_exit(vcpu);

	/*
	 * We do not use IBRS in the kernel. If this vCPU has used the
	 * SPEC_CTRL MSR it may have left it on; save the value and
	 * turn it off. This is much more efficient than blindly adding
	 * it to the atomic save/restore list. Especially as the former
	 * (saving guest MSRs on vmexit) doesn't even exist in KVM.
	 *
	 * For non-nested case:
	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 *
	 * For nested case:
	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 */
	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) &&
	    unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
		svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);

	if (!sev_es_guest(vcpu->kvm))
		reload_tss(vcpu);

	if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL))
		x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);

	if (!sev_es_guest(vcpu->kvm)) {
		vcpu->arch.cr2 = svm->vmcb->save.cr2;
		vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
		vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
		vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
	}
	vcpu->arch.regs_dirty = 0;

	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
		kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);

	kvm_load_host_xsave_state(vcpu);
	stgi();

	/* Any pending NMI will happen here */

	if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
		kvm_after_interrupt(vcpu);

	sync_cr8_to_lapic(vcpu);

	svm->next_rip = 0;
	if (is_guest_mode(vcpu)) {
		nested_sync_control_from_vmcb02(svm);

		/* Track VMRUNs that have made it past consistency checking */
		if (svm->nested.nested_run_pending &&
		    svm->vmcb->control.exit_code != SVM_EXIT_ERR)
			++vcpu->stat.nested_run;

		svm->nested.nested_run_pending = 0;
	}

	svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
	vmcb_mark_all_clean(svm->vmcb);

	/* If the exit was due to a #PF, check for an async #PF */
	if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
		vcpu->arch.apf.host_apf_flags =
			kvm_read_and_reset_apf_flags();

	vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET;

	/*
	 * We need to handle MC intercepts here before the vcpu has a chance
	 * to change the physical cpu.
	 */
	if (unlikely(svm->vmcb->control.exit_code ==
		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
		svm_handle_mce(vcpu);

	svm_complete_interrupts(vcpu);

	if (is_guest_mode(vcpu))
		return EXIT_FASTPATH_NONE;

	return svm_exit_handlers_fastpath(vcpu);
}
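/*
 * Reader's note (descriptive only): svm_vcpu_run() brackets the world
 * switch with clgi()/stgi(), so physical interrupts, NMIs and SMIs are
 * held off from just before guest state is loaded until just after
 * host state is restored.  An NMI that arrived in between is delivered
 * at the stgi() call, which is why kvm_before_interrupt() and
 * kvm_after_interrupt() are wrapped around it for SVM_EXIT_NMI exits.
 */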
static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
			     int root_level)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long cr3;

	if (npt_enabled) {
		svm->vmcb->control.nested_cr3 = __sme_set(root_hpa);
		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);

		hv_track_root_tdp(vcpu, root_hpa);

		cr3 = vcpu->arch.cr3;
	} else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
		cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
	} else {
		/* PCID in the guest should be impossible with a 32-bit MMU. */
		WARN_ON_ONCE(kvm_get_active_pcid(vcpu));
		cr3 = root_hpa;
	}

	svm->vmcb->save.cr3 = cr3;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}

static int is_disabled(void)
{
	u64 vm_cr;

	rdmsrl(MSR_VM_CR, vm_cr);
	if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
		return 1;

	return 0;
}

static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/*
	 * Patch in the VMMCALL instruction:
	 */
	hypercall[0] = 0x0f;
	hypercall[1] = 0x01;
	hypercall[2] = 0xd9;
}
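/*
 * Reader's note (descriptive only): 0f 01 d9 encodes VMMCALL.  Intel's
 * VMCALL is 0f 01 c1, so when a guest uses the other vendor's
 * hypercall instruction and takes a #UD, the emulator uses this
 * callback to rewrite it, keeping hypercalls working on both vendors.
 */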
static int __init svm_check_processor_compat(void)
{
	return 0;
}

static bool svm_cpu_has_accelerated_tpr(void)
{
	return false;
}

/*
 * The kvm parameter can be NULL (module initialization, or invocation before
 * VM creation). Be sure to check the kvm parameter before using it.
 */
static bool svm_has_emulated_msr(struct kvm *kvm, u32 index)
{
	switch (index) {
	case MSR_IA32_MCG_EXT_CTL:
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		return false;
	case MSR_IA32_SMBASE:
		/* SEV-ES guests do not support SMM, so report false */
		if (kvm && sev_es_guest(kvm))
			return false;
		break;
	default:
		break;
	}

	return true;
}

static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
	return 0;
}

static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_cpuid_entry2 *best;

	vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
				    boot_cpu_has(X86_FEATURE_XSAVE) &&
				    boot_cpu_has(X86_FEATURE_XSAVES);

	/* Update nrips enabled cache */
	svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) &&
			     guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);

	svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);

	svm_recalc_instruction_intercepts(vcpu, svm);

	/* For SEV guests, the memory encryption bit is not reserved in CR3. */
	if (sev_guest(vcpu->kvm)) {
		best = kvm_find_cpuid_entry(vcpu, 0x8000001F, 0);
		if (best)
			vcpu->arch.reserved_gpa_bits &= ~(1UL << (best->ebx & 0x3f));
	}

	if (kvm_vcpu_apicv_active(vcpu)) {
		/*
		 * AVIC does not work with an x2APIC mode guest.  If the
		 * x2APIC feature is exposed to the guest, disable AVIC.
		 */
		if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
			kvm_request_apicv_update(vcpu->kvm, false,
						 APICV_INHIBIT_REASON_X2APIC);

		/*
		 * Currently, AVIC does not work with nested virtualization,
		 * so disable AVIC when the SVM CPUID bit is exposed to the
		 * L1 guest.
		 */
		if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
			kvm_request_apicv_update(vcpu->kvm, false,
						 APICV_INHIBIT_REASON_NESTED);
	}
	init_vmcb_after_set_cpuid(vcpu);
}

static bool svm_has_wbinvd_exit(void)
{
	return true;
}

#define PRE_EX(exit)	{ .exit_code = (exit), \
			  .stage = X86_ICPT_PRE_EXCEPT, }
#define POST_EX(exit)	{ .exit_code = (exit), \
			  .stage = X86_ICPT_POST_EXCEPT, }
#define POST_MEM(exit)	{ .exit_code = (exit), \
			  .stage = X86_ICPT_POST_MEMACCESS, }

static const struct __x86_intercept {
	u32 exit_code;
	enum x86_intercept_stage stage;
} x86_intercept_map[] = {
	[x86_intercept_cr_read]		= POST_EX(SVM_EXIT_READ_CR0),
	[x86_intercept_cr_write]	= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_clts]		= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_lmsw]		= POST_EX(SVM_EXIT_WRITE_CR0),
	[x86_intercept_smsw]		= POST_EX(SVM_EXIT_READ_CR0),
	[x86_intercept_dr_read]		= POST_EX(SVM_EXIT_READ_DR0),
	[x86_intercept_dr_write]	= POST_EX(SVM_EXIT_WRITE_DR0),
	[x86_intercept_sldt]		= POST_EX(SVM_EXIT_LDTR_READ),
	[x86_intercept_str]		= POST_EX(SVM_EXIT_TR_READ),
	[x86_intercept_lldt]		= POST_EX(SVM_EXIT_LDTR_WRITE),
	[x86_intercept_ltr]		= POST_EX(SVM_EXIT_TR_WRITE),
	[x86_intercept_sgdt]		= POST_EX(SVM_EXIT_GDTR_READ),
	[x86_intercept_sidt]		= POST_EX(SVM_EXIT_IDTR_READ),
	[x86_intercept_lgdt]		= POST_EX(SVM_EXIT_GDTR_WRITE),
	[x86_intercept_lidt]		= POST_EX(SVM_EXIT_IDTR_WRITE),
	[x86_intercept_vmrun]		= POST_EX(SVM_EXIT_VMRUN),
	[x86_intercept_vmmcall]		= POST_EX(SVM_EXIT_VMMCALL),
	[x86_intercept_vmload]		= POST_EX(SVM_EXIT_VMLOAD),
	[x86_intercept_vmsave]		= POST_EX(SVM_EXIT_VMSAVE),
	[x86_intercept_stgi]		= POST_EX(SVM_EXIT_STGI),
	[x86_intercept_clgi]		= POST_EX(SVM_EXIT_CLGI),
	[x86_intercept_skinit]		= POST_EX(SVM_EXIT_SKINIT),
	[x86_intercept_invlpga]		= POST_EX(SVM_EXIT_INVLPGA),
	[x86_intercept_rdtscp]		= POST_EX(SVM_EXIT_RDTSCP),
	[x86_intercept_monitor]		= POST_MEM(SVM_EXIT_MONITOR),
	[x86_intercept_mwait]		= POST_EX(SVM_EXIT_MWAIT),
	[x86_intercept_invlpg]		= POST_EX(SVM_EXIT_INVLPG),
	[x86_intercept_invd]		= POST_EX(SVM_EXIT_INVD),
	[x86_intercept_wbinvd]		= POST_EX(SVM_EXIT_WBINVD),
	[x86_intercept_wrmsr]		= POST_EX(SVM_EXIT_MSR),
	[x86_intercept_rdtsc]		= POST_EX(SVM_EXIT_RDTSC),
	[x86_intercept_rdmsr]		= POST_EX(SVM_EXIT_MSR),
	[x86_intercept_rdpmc]		= POST_EX(SVM_EXIT_RDPMC),
	[x86_intercept_cpuid]		= PRE_EX(SVM_EXIT_CPUID),
	[x86_intercept_rsm]		= PRE_EX(SVM_EXIT_RSM),
	[x86_intercept_pause]		= PRE_EX(SVM_EXIT_PAUSE),
	[x86_intercept_pushf]		= PRE_EX(SVM_EXIT_PUSHF),
	[x86_intercept_popf]		= PRE_EX(SVM_EXIT_POPF),
	[x86_intercept_intn]		= PRE_EX(SVM_EXIT_SWINT),
	[x86_intercept_iret]		= PRE_EX(SVM_EXIT_IRET),
	[x86_intercept_icebp]		= PRE_EX(SVM_EXIT_ICEBP),
	[x86_intercept_hlt]		= POST_EX(SVM_EXIT_HLT),
	[x86_intercept_in]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_ins]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_out]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_outs]		= POST_EX(SVM_EXIT_IOIO),
	[x86_intercept_xsetbv]		= PRE_EX(SVM_EXIT_XSETBV),
};

#undef PRE_EX
#undef POST_EX
#undef POST_MEM
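/*
 * Reader's note (descriptive only): the stages above encode when each
 * intercept is checked relative to emulation, mirroring the
 * architectural ordering: X86_ICPT_PRE_EXCEPT before exception checks,
 * X86_ICPT_POST_EXCEPT after them but before any memory access, and
 * X86_ICPT_POST_MEMACCESS after the memory access.  The emulator
 * invokes svm_check_intercept() below at each stage, and only an entry
 * with the matching stage is considered for a nested #VMEXIT.
 */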
static int svm_check_intercept(struct kvm_vcpu *vcpu,
			       struct x86_instruction_info *info,
			       enum x86_intercept_stage stage,
			       struct x86_exception *exception)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int vmexit, ret = X86EMUL_CONTINUE;
	struct __x86_intercept icpt_info;
	struct vmcb *vmcb = svm->vmcb;

	if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
		goto out;

	icpt_info = x86_intercept_map[info->intercept];

	if (stage != icpt_info.stage)
		goto out;

	switch (icpt_info.exit_code) {
	case SVM_EXIT_READ_CR0:
		if (info->intercept == x86_intercept_cr_read)
			icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_WRITE_CR0: {
		unsigned long cr0, val;

		if (info->intercept == x86_intercept_cr_write)
			icpt_info.exit_code += info->modrm_reg;

		if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
		    info->intercept == x86_intercept_clts)
			break;

		if (!(vmcb12_is_intercept(&svm->nested.ctl,
					  INTERCEPT_SELECTIVE_CR0)))
			break;

		cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
		val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;

		if (info->intercept == x86_intercept_lmsw) {
			cr0 &= 0xfUL;
			val &= 0xfUL;
			/* lmsw can't clear PE - catch this here */
			if (cr0 & X86_CR0_PE)
				val |= X86_CR0_PE;
		}

		if (cr0 ^ val)
			icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;

		break;
	}
	case SVM_EXIT_READ_DR0:
	case SVM_EXIT_WRITE_DR0:
		icpt_info.exit_code += info->modrm_reg;
		break;
	case SVM_EXIT_MSR:
		if (info->intercept == x86_intercept_wrmsr)
			vmcb->control.exit_info_1 = 1;
		else
			vmcb->control.exit_info_1 = 0;
		break;
	case SVM_EXIT_PAUSE:
		/*
		 * PAUSE is "REP NOP" (F3 90), so the emulator decodes it as a
		 * plain NOP; the PAUSE intercept only applies when the REP
		 * prefix is present, so check for it here.
		 */
		if (info->rep_prefix != REPE_PREFIX)
			goto out;
		break;
	case SVM_EXIT_IOIO: {
		u64 exit_info;
		u32 bytes;

		if (info->intercept == x86_intercept_in ||
		    info->intercept == x86_intercept_ins) {
			exit_info = ((info->src_val & 0xffff) << 16) |
				SVM_IOIO_TYPE_MASK;
			bytes = info->dst_bytes;
		} else {
			exit_info = (info->dst_val & 0xffff) << 16;
			bytes = info->src_bytes;
		}

		if (info->intercept == x86_intercept_outs ||
		    info->intercept == x86_intercept_ins)
			exit_info |= SVM_IOIO_STR_MASK;

		if (info->rep_prefix)
			exit_info |= SVM_IOIO_REP_MASK;

		bytes = min(bytes, 4u);

		exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;

		exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);

		vmcb->control.exit_info_1 = exit_info;
		vmcb->control.exit_info_2 = info->next_rip;

		break;
	}
	default:
		break;
	}

	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
	if (static_cpu_has(X86_FEATURE_NRIPS))
		vmcb->control.next_rip = info->next_rip;
	vmcb->control.exit_code = icpt_info.exit_code;
	vmexit = nested_svm_exit_handled(svm);

	ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
					   : X86EMUL_CONTINUE;

out:
	return ret;
}
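/*
 * Reader's note on the IOIO encoding built above, per the SVM_IOIO_*
 * definitions: exit_info_1 packs the port into bits 31:16, the operand
 * size into bits 6:4 (SZ8/SZ16/SZ32), the address size into bits 9:7,
 * and the REP (bit 3), STR (bit 2) and TYPE (bit 0, set for IN) flags;
 * exit_info_2 holds the RIP of the next instruction.  As a minimal,
 * purely illustrative sketch (not used by KVM), a non-string,
 * non-repeated IN of 'bytes' bytes from 'port' would be encoded as:
 */
static inline u64 ioio_exit_info_in(u16 port, u32 bytes)
{
	return ((u64)port << 16) | SVM_IOIO_TYPE_MASK |
	       (bytes << SVM_IOIO_SIZE_SHIFT);
}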
static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
}

static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
	if (!kvm_pause_in_guest(vcpu->kvm))
		shrink_ple_window(vcpu);
}

static void svm_setup_mce(struct kvm_vcpu *vcpu)
{
	/* [63:9] are reserved. */
	vcpu->arch.mcg_cap &= 0x1ff;
}

bool svm_smi_blocked(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/* Per APM Vol.2 15.22.2 "Response to SMI" */
	if (!gif_set(svm))
		return true;

	return is_smm(vcpu);
}

static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (svm->nested.nested_run_pending)
		return -EBUSY;

	/* An SMI must not be injected into L2 if it's supposed to VM-Exit. */
	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm))
		return -EBUSY;

	return !svm_smi_blocked(vcpu);
}

static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map map_save;
	int ret;

	if (!is_guest_mode(vcpu))
		return 0;

	/* FED8h - SVM Guest */
	put_smstate(u64, smstate, 0x7ed8, 1);
	/* FEE0h - SVM Guest VMCB Physical Address */
	put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb12_gpa);

	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];

	ret = nested_svm_vmexit(svm);
	if (ret)
		return ret;

	/*
	 * KVM uses VMCB01 to store L1 host state while L2 runs, but
	 * VMCB01 is going to be used during SMM and thus the state will
	 * be lost.  Temporarily save the non-VMLOAD/VMSAVE state to the
	 * host save area pointed to by MSR_VM_HSAVE_PA.  The APM
	 * guarantees that the format of the area is identical to the
	 * guest save area offset by 0x400 (matching the offset of
	 * 'struct vmcb_save_area' within 'struct vmcb').  Note: the HSAVE
	 * area may also be used by the L1 hypervisor to save additional
	 * host context (e.g. KVM does that, see
	 * svm_prepare_guest_switch()), which must be preserved.
	 */
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr),
			 &map_save) == -EINVAL)
		return 1;

	BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400);

	svm_copy_vmrun_state(map_save.hva + 0x400,
			     &svm->vmcb01.ptr->save);

	kvm_vcpu_unmap(vcpu, &map_save, true);
	return 0;
}
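/*
 * Reader's note (descriptive only): the SMRAM offsets used here and in
 * svm_leave_smm() below match the AMD 64-bit SMM state-save area:
 * FED0h holds EFER, FED8h the "SVM guest" flag and FEE0h the guest
 * VMCB physical address.  put_smstate()/GET_SMSTATE() address the
 * save area such that offset 0x7ed8 maps to FED8h, hence the
 * 0x7ed0/0x7ed8/0x7ee0 constants.
 */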
static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct kvm_host_map map, map_save;
	u64 saved_efer, vmcb12_gpa;
	struct vmcb *vmcb12;
	int ret;

	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
		return 0;

	/* Non-zero if the SMI arrived while the vCPU was in guest mode. */
	if (!GET_SMSTATE(u64, smstate, 0x7ed8))
		return 0;

	if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM))
		return 1;

	saved_efer = GET_SMSTATE(u64, smstate, 0x7ed0);
	if (!(saved_efer & EFER_SVME))
		return 1;

	vmcb12_gpa = GET_SMSTATE(u64, smstate, 0x7ee0);
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map) == -EINVAL)
		return 1;

	ret = 1;
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save) == -EINVAL)
		goto unmap_map;

	if (svm_allocate_nested(svm))
		goto unmap_save;

	/*
	 * Restore L1 host state from the L1 HSAVE area, as VMCB01 was
	 * used during SMM (see svm_enter_smm()).
	 */
	svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);

	/*
	 * Enter the nested guest now.
	 */
	vmcb12 = map.hva;
	nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
	nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
	ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false);

unmap_save:
	kvm_vcpu_unmap(vcpu, &map_save, true);
unmap_map:
	kvm_vcpu_unmap(vcpu, &map, true);
	return ret;
}

static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!gif_set(svm)) {
		if (vgif_enabled(svm))
			svm_set_intercept(svm, INTERCEPT_STGI);
		/* STGI will cause a vm exit */
	} else {
		/* We must be in SMM; RSM will cause a vmexit anyway. */
	}
}
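/*
 * Reader's note for svm_can_emulate_instruction() below (descriptive
 * only): once DecodeAssist has returned zero instruction bytes, the
 * erratum signature is
 *
 *	CR4.SMAP == 1 && (CR4.SMEP == 0 || CPL == 3)
 *
 * When it matches, a non-SEV guest falls back to normal emulation (the
 * host can fetch the bytes itself), while an SEV guest is killed
 * because its memory cannot be read.  Zero bytes without the signature
 * is unexpected, and emulation is refused.
 */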
static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
{
	bool smep, smap, is_user;
	unsigned long cr4;

	/*
	 * When the guest is an SEV-ES guest, emulation is not possible.
	 */
	if (sev_es_guest(vcpu->kvm))
		return false;

	/*
	 * Detect and work around Erratum 1096 Fam_17h_00_0Fh.
	 *
	 * Erratum:
	 * When the CPU raises a #NPF on a guest data access and vCPU
	 * CR4.SMAP=1, it is possible that the CPU microcode implementing
	 * DecodeAssist will fail to read the bytes of the instruction that
	 * caused the #NPF.  In this case, the GuestIntrBytes field of the
	 * VMCB on a VMEXIT will incorrectly return 0 instead of the correct
	 * guest instruction bytes.
	 *
	 * This happens because the CPU microcode that reads the instruction
	 * bytes uses a special opcode which attempts to read data using
	 * CPL=0 privileges.  The microcode reads CS:RIP and, if it hits an
	 * SMAP fault, gives up and returns no instruction bytes.
	 *
	 * Detection:
	 * We reach here if the CPU supports DecodeAssist, raised a #NPF and
	 * returned 0 in the GuestIntrBytes field of the VMCB.
	 * First, the erratum can only be triggered when vCPU CR4.SMAP=1.
	 * Second, if vCPU CR4.SMEP=1, the erratum can only be triggered when
	 * the vCPU is at CPL==3 (otherwise the guest would have taken an
	 * SMEP fault instead of a #NPF).  If vCPU CR4.SMEP=0, the erratum
	 * can be triggered at any vCPU CPL.  As most guests that enable SMAP
	 * also enable SMEP, use the above logic to minimize false positives
	 * while preserving semantic correctness in all cases.
	 *
	 * Workaround:
	 * To determine what instruction the guest was executing, the
	 * hypervisor has to decode the instruction at the instruction
	 * pointer.
	 *
	 * In a non-SEV guest, the hypervisor can read guest memory to decode
	 * the instruction when insn_len is zero, so return true to indicate
	 * that decoding is possible.
	 *
	 * But in an SEV guest, guest memory is encrypted with a
	 * guest-specific key and the hypervisor cannot decode the
	 * instruction, so the erratum cannot be worked around.  Print an
	 * error and request that the guest be killed.
	 */
	if (likely(!insn || insn_len))
		return true;

	/*
	 * If RIP is invalid, go ahead with emulation which will cause an
	 * internal error exit.
	 */
	if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
		return true;

	cr4 = kvm_read_cr4(vcpu);
	smep = cr4 & X86_CR4_SMEP;
	smap = cr4 & X86_CR4_SMAP;
	is_user = svm_get_cpl(vcpu) == 3;
	if (smap && (!smep || is_user)) {
		if (!sev_guest(vcpu->kvm))
			return true;

		pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	}

	return false;
}

static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * TODO: the last condition latches INIT signals on the vCPU when the
	 * vCPU is in guest mode and vmcb12 defines an INIT intercept.  To
	 * properly emulate the INIT intercept, svm_check_nested_events()
	 * should call nested_svm_vmexit() if an INIT signal is pending.
	 */
	return !gif_set(svm) ||
	       (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
}

static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
	if (!sev_es_guest(vcpu->kvm))
		return kvm_vcpu_deliver_sipi_vector(vcpu, vector);

	sev_vcpu_deliver_sipi_vector(vcpu, vector);
}

static void svm_vm_destroy(struct kvm *kvm)
{
	avic_vm_destroy(kvm);
	sev_vm_destroy(kvm);
}

static int svm_vm_init(struct kvm *kvm)
{
	if (!pause_filter_count || !pause_filter_thresh)
		kvm->arch.pause_in_guest = true;

	if (enable_apicv) {
		int ret = avic_vm_init(kvm);

		if (ret)
			return ret;
	}

	return 0;
}
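/*
 * Reader's note (descriptive only): pause_in_guest is set when either
 * PLE module parameter is zero, which disables pause filtering for the
 * whole VM; see svm_sched_in() above, which skips shrinking the PLE
 * window in that case.
 */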
static struct kvm_x86_ops svm_x86_ops __initdata = {
	.name = "kvm_amd",

	.hardware_unsetup = svm_hardware_teardown,
	.hardware_enable = svm_hardware_enable,
	.hardware_disable = svm_hardware_disable,
	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
	.has_emulated_msr = svm_has_emulated_msr,

	.vcpu_create = svm_create_vcpu,
	.vcpu_free = svm_free_vcpu,
	.vcpu_reset = svm_vcpu_reset,

	.vm_size = sizeof(struct kvm_svm),
	.vm_init = svm_vm_init,
	.vm_destroy = svm_vm_destroy,

	.prepare_guest_switch = svm_prepare_guest_switch,
	.vcpu_load = svm_vcpu_load,
	.vcpu_put = svm_vcpu_put,
	.vcpu_blocking = svm_vcpu_blocking,
	.vcpu_unblocking = svm_vcpu_unblocking,

	.update_exception_bitmap = svm_update_exception_bitmap,
	.get_msr_feature = svm_get_msr_feature,
	.get_msr = svm_get_msr,
	.set_msr = svm_set_msr,
	.get_segment_base = svm_get_segment_base,
	.get_segment = svm_get_segment,
	.set_segment = svm_set_segment,
	.get_cpl = svm_get_cpl,
	.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
	.set_cr0 = svm_set_cr0,
	.post_set_cr3 = svm_post_set_cr3,
	.is_valid_cr4 = svm_is_valid_cr4,
	.set_cr4 = svm_set_cr4,
	.set_efer = svm_set_efer,
	.get_idt = svm_get_idt,
	.set_idt = svm_set_idt,
	.get_gdt = svm_get_gdt,
	.set_gdt = svm_set_gdt,
	.set_dr7 = svm_set_dr7,
	.sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
	.cache_reg = svm_cache_reg,
	.get_rflags = svm_get_rflags,
	.set_rflags = svm_set_rflags,
	.get_if_flag = svm_get_if_flag,

	.tlb_flush_all = svm_flush_tlb,
	.tlb_flush_current = svm_flush_tlb,
	.tlb_flush_gva = svm_flush_tlb_gva,
	.tlb_flush_guest = svm_flush_tlb,

	.run = svm_vcpu_run,
	.handle_exit = handle_exit,
	.skip_emulated_instruction = skip_emulated_instruction,
	.update_emulated_instruction = NULL,
	.set_interrupt_shadow = svm_set_interrupt_shadow,
	.get_interrupt_shadow = svm_get_interrupt_shadow,
	.patch_hypercall = svm_patch_hypercall,
	.set_irq = svm_set_irq,
	.set_nmi = svm_inject_nmi,
	.queue_exception = svm_queue_exception,
	.cancel_injection = svm_cancel_injection,
	.interrupt_allowed = svm_interrupt_allowed,
	.nmi_allowed = svm_nmi_allowed,
	.get_nmi_mask = svm_get_nmi_mask,
	.set_nmi_mask = svm_set_nmi_mask,
	.enable_nmi_window = svm_enable_nmi_window,
	.enable_irq_window = svm_enable_irq_window,
	.update_cr8_intercept = svm_update_cr8_intercept,
	.set_virtual_apic_mode = svm_set_virtual_apic_mode,
	.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
	.check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
	.load_eoi_exitmap = svm_load_eoi_exitmap,
	.hwapic_irr_update = svm_hwapic_irr_update,
	.hwapic_isr_update = svm_hwapic_isr_update,
	.apicv_post_state_restore = avic_post_state_restore,

	.set_tss_addr = svm_set_tss_addr,
	.set_identity_map_addr = svm_set_identity_map_addr,
	.get_mt_mask = svm_get_mt_mask,

	.get_exit_info = svm_get_exit_info,

	.vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid,

	.has_wbinvd_exit = svm_has_wbinvd_exit,

	.get_l2_tsc_offset = svm_get_l2_tsc_offset,
	.get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier,
	.write_tsc_offset = svm_write_tsc_offset,
	.write_tsc_multiplier = svm_write_tsc_multiplier,

	.load_mmu_pgd = svm_load_mmu_pgd,

	.check_intercept = svm_check_intercept,
	.handle_exit_irqoff = svm_handle_exit_irqoff,

	.request_immediate_exit = __kvm_request_immediate_exit,

	.sched_in = svm_sched_in,

	.pmu_ops = &amd_pmu_ops,
	.nested_ops = &svm_nested_ops,

	.deliver_posted_interrupt = svm_deliver_avic_intr,
	.dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
	.update_pi_irte = svm_update_pi_irte,
	.setup_mce = svm_setup_mce,

	.smi_allowed = svm_smi_allowed,
	.enter_smm = svm_enter_smm,
	.leave_smm = svm_leave_smm,
	.enable_smi_window = svm_enable_smi_window,

	.mem_enc_op = svm_mem_enc_op,
	.mem_enc_reg_region = svm_register_enc_region,
	.mem_enc_unreg_region = svm_unregister_enc_region,

	.vm_copy_enc_context_from = svm_vm_copy_asid_from,
	.vm_move_enc_context_from = svm_vm_migrate_from,

	.can_emulate_instruction = svm_can_emulate_instruction,

	.apic_init_signal_blocked = svm_apic_init_signal_blocked,

	.msr_filter_changed = svm_msr_filter_changed,
	.complete_emulated_msr = svm_complete_emulated_msr,

	.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
};
static struct kvm_x86_init_ops svm_init_ops __initdata = {
	.cpu_has_kvm_support = has_svm,
	.disabled_by_bios = is_disabled,
	.hardware_setup = svm_hardware_setup,
	.check_processor_compatibility = svm_check_processor_compat,

	.runtime_ops = &svm_x86_ops,
};

static int __init svm_init(void)
{
	__unused_size_checks();

	return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
			__alignof__(struct vcpu_svm), THIS_MODULE);
}

static void __exit svm_exit(void)
{
	kvm_exit();
}

module_init(svm_init)
module_exit(svm_exit)