// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 * cpuid support routines
 *
 * derived from arch/x86/kvm/x86.c
 *
 * Copyright 2011 Red Hat, Inc. and/or its affiliates.
 * Copyright IBM Corporation, 2008
 */

#include <linux/kvm_host.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
#include <linux/sched/stat.h>

#include <asm/processor.h>
#include <asm/user.h>
#include <asm/fpu/xstate.h>
#include "cpuid.h"
#include "lapic.h"
#include "mmu.h"
#include "trace.h"
#include "pmu.h"

/*
 * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
 * aligned to sizeof(unsigned long) because it's not accessed via bitops.
 */
u32 kvm_cpu_caps[NCAPINTS] __read_mostly;
EXPORT_SYMBOL_GPL(kvm_cpu_caps);

static u32 xstate_required_size(u64 xstate_bv, bool compacted)
{
	int feature_bit = 0;
	u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;

	xstate_bv &= XFEATURE_MASK_EXTEND;
	while (xstate_bv) {
		if (xstate_bv & 0x1) {
			u32 eax, ebx, ecx, edx, offset;
			cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
			offset = compacted ? ret : ebx;
			ret = max(ret, offset + eax);
		}

		xstate_bv >>= 1;
		feature_bit++;
	}

	return ret;
}

#define F feature_bit

static inline struct kvm_cpuid_entry2 *cpuid_entry2_find(
	struct kvm_cpuid_entry2 *entries, int nent, u32 function, u32 index)
{
	struct kvm_cpuid_entry2 *e;
	int i;

	for (i = 0; i < nent; i++) {
		e = &entries[i];

		if (e->function == function && (e->index == index ||
		    !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
			return e;
	}

	return NULL;
}

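/*
 * Validate CPUID data from userspace before it is committed to the vCPU,
 * e.g. reject a guest virtual address width KVM's canonical checks can't
 * handle.
 */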
static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent)
{
	struct kvm_cpuid_entry2 *best;

	/*
	 * The existing code assumes virtual address is 48-bit or 57-bit in the
	 * canonical address checks; exit if it is ever changed.
	 */
	best = cpuid_entry2_find(entries, nent, 0x80000008, 0);
	if (best) {
		int vaddr_bits = (best->eax & 0xff00) >> 8;

		if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0)
			return -EINVAL;
	}

	return 0;
}

void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
{
	struct kvm_cpuid_entry2 *best;

	best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);

	/*
	 * save the feature bitmap to avoid cpuid lookup for every PV
	 * operation
	 */
	if (best)
		vcpu->arch.pv_cpuid.features = best->eax;
}

void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
{
	struct kvm_cpuid_entry2 *best;

	best = kvm_find_cpuid_entry(vcpu, 1, 0);
	if (best) {
		/* Update OSXSAVE bit */
		if (boot_cpu_has(X86_FEATURE_XSAVE))
			cpuid_entry_change(best, X86_FEATURE_OSXSAVE,
					   kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE));

		cpuid_entry_change(best, X86_FEATURE_APIC,
				   vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
	}

	best = kvm_find_cpuid_entry(vcpu, 7, 0);
	if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
		cpuid_entry_change(best, X86_FEATURE_OSPKE,
				   kvm_read_cr4_bits(vcpu, X86_CR4_PKE));

	best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
	if (best)
		best->ebx = xstate_required_size(vcpu->arch.xcr0, false);

	best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
	if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
		     cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
		best->ebx = xstate_required_size(vcpu->arch.xcr0, true);

	best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0);
	if (kvm_hlt_in_guest(vcpu->kvm) && best &&
	    (best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
		best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);

	if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
		best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
		if (best)
			cpuid_entry_change(best, X86_FEATURE_MWAIT,
					   vcpu->arch.ia32_misc_enable_msr &
					   MSR_IA32_MISC_ENABLE_MWAIT);
	}
}
EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);

static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct kvm_lapic *apic = vcpu->arch.apic;
	struct kvm_cpuid_entry2 *best;

	best = kvm_find_cpuid_entry(vcpu, 1, 0);
	if (best && apic) {
		if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER))
			apic->lapic_timer.timer_mode_mask = 3 << 17;
		else
			apic->lapic_timer.timer_mode_mask = 1 << 17;

		kvm_apic_set_version(vcpu);
	}

	best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
	if (!best)
		vcpu->arch.guest_supported_xcr0 = 0;
	else
		vcpu->arch.guest_supported_xcr0 =
			(best->eax | ((u64)best->edx << 32)) & supported_xcr0;

	kvm_update_pv_runtime(vcpu);

	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
	kvm_mmu_reset_context(vcpu);

	kvm_pmu_refresh(vcpu);
	vcpu->arch.cr4_guest_rsvd_bits =
		__cr4_reserved_bits(guest_cpuid_has, vcpu);

	vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);

	/* Invoke the vendor callback only after the above state is updated. */
	kvm_x86_ops.vcpu_after_set_cpuid(vcpu);
}

static int is_efer_nx(void)
{
	return host_efer & EFER_NX;
}

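/*
 * If the host doesn't have EFER.NX set, NX can't work in the guest; strip the
 * NX bit from the guest's 0x80000001 leaf so it isn't falsely advertised.
 */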
static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
{
	int i;
	struct kvm_cpuid_entry2 *e, *entry;

	entry = NULL;
	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
		e = &vcpu->arch.cpuid_entries[i];
		if (e->function == 0x80000001) {
			entry = e;
			break;
		}
	}
	if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) {
		cpuid_entry_clear(entry, X86_FEATURE_NX);
		printk(KERN_INFO "kvm: guest NX capability removed\n");
	}
}

int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
{
	struct kvm_cpuid_entry2 *best;

	best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
	if (!best || best->eax < 0x80000008)
		goto not_found;
	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
	if (best)
		return best->eax & 0xff;
not_found:
	return 36;
}

/* when an old userspace process fills a new kernel module */
int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
			     struct kvm_cpuid *cpuid,
			     struct kvm_cpuid_entry __user *entries)
{
	int r, i;
	struct kvm_cpuid_entry *e = NULL;
	struct kvm_cpuid_entry2 *e2 = NULL;

	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		return -E2BIG;

	if (cpuid->nent) {
		e = vmemdup_user(entries, array_size(sizeof(*e), cpuid->nent));
		if (IS_ERR(e))
			return PTR_ERR(e);

		e2 = kvmalloc_array(cpuid->nent, sizeof(*e2), GFP_KERNEL_ACCOUNT);
		if (!e2) {
			r = -ENOMEM;
			goto out_free_cpuid;
		}
	}
	for (i = 0; i < cpuid->nent; i++) {
		e2[i].function = e[i].function;
		e2[i].eax = e[i].eax;
		e2[i].ebx = e[i].ebx;
		e2[i].ecx = e[i].ecx;
		e2[i].edx = e[i].edx;
		e2[i].index = 0;
		e2[i].flags = 0;
		e2[i].padding[0] = 0;
		e2[i].padding[1] = 0;
		e2[i].padding[2] = 0;
	}

	r = kvm_check_cpuid(e2, cpuid->nent);
	if (r) {
		kvfree(e2);
		goto out_free_cpuid;
	}

	kvfree(vcpu->arch.cpuid_entries);
	vcpu->arch.cpuid_entries = e2;
	vcpu->arch.cpuid_nent = cpuid->nent;

	cpuid_fix_nx_cap(vcpu);
	kvm_update_cpuid_runtime(vcpu);
	kvm_vcpu_after_set_cpuid(vcpu);

out_free_cpuid:
	kvfree(e);

	return r;
}

int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
			      struct kvm_cpuid2 *cpuid,
			      struct kvm_cpuid_entry2 __user *entries)
{
	struct kvm_cpuid_entry2 *e2 = NULL;
	int r;

	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		return -E2BIG;

	if (cpuid->nent) {
		e2 = vmemdup_user(entries, array_size(sizeof(*e2), cpuid->nent));
		if (IS_ERR(e2))
			return PTR_ERR(e2);
	}

	r = kvm_check_cpuid(e2, cpuid->nent);
	if (r) {
		kvfree(e2);
		return r;
	}

	kvfree(vcpu->arch.cpuid_entries);
	vcpu->arch.cpuid_entries = e2;
	vcpu->arch.cpuid_nent = cpuid->nent;

	kvm_update_cpuid_runtime(vcpu);
	kvm_vcpu_after_set_cpuid(vcpu);

	return 0;
}

int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
			      struct kvm_cpuid2 *cpuid,
			      struct kvm_cpuid_entry2 __user *entries)
{
	int r;

	r = -E2BIG;
	if (cpuid->nent < vcpu->arch.cpuid_nent)
		goto out;
	r = -EFAULT;
	if (copy_to_user(entries, vcpu->arch.cpuid_entries,
			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
		goto out;
	return 0;

out:
	cpuid->nent = vcpu->arch.cpuid_nent;
	return r;
}

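/*
 * Restrict kvm_cpu_caps[@leaf] to the features in @mask that the host CPU
 * actually enumerates via CPUID for that leaf; a feature is advertised only
 * if both KVM and the hardware support it.
 */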
static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask)
{
	const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32);
	struct kvm_cpuid_entry2 entry;

	reverse_cpuid_check(leaf);
	kvm_cpu_caps[leaf] &= mask;

	cpuid_count(cpuid.function, cpuid.index,
		    &entry.eax, &entry.ebx, &entry.ecx, &entry.edx);

	kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg);
}

void kvm_set_cpu_caps(void)
{
	unsigned int f_nx = is_efer_nx() ? F(NX) : 0;
#ifdef CONFIG_X86_64
	unsigned int f_gbpages = F(GBPAGES);
	unsigned int f_lm = F(LM);
#else
	unsigned int f_gbpages = 0;
	unsigned int f_lm = 0;
#endif

	BUILD_BUG_ON(sizeof(kvm_cpu_caps) >
		     sizeof(boot_cpu_data.x86_capability));

	memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability,
	       sizeof(kvm_cpu_caps));

	kvm_cpu_cap_mask(CPUID_1_ECX,
		/*
		 * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not*
		 * advertised to guests via CPUID!
		 */
		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
		0 /* DS-CPL, VMX, SMX, EST */ |
		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
		F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) |
		F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
		0 /* Reserved */ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
		F(F16C) | F(RDRAND)
	);
	/* KVM emulates x2apic in software irrespective of host support. */
	kvm_cpu_cap_set(X86_FEATURE_X2APIC);

	kvm_cpu_cap_mask(CPUID_1_EDX,
		F(FPU) | F(VME) | F(DE) | F(PSE) |
		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) |
		0 /* Reserved, DS, ACPI */ | F(MMX) |
		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
		0 /* HTT, TM, Reserved, PBE */
	);

	kvm_cpu_cap_mask(CPUID_7_0_EBX,
		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
		F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) |
		F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
		F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
		F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/
	);

	kvm_cpu_cap_mask(CPUID_7_ECX,
		F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
		F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
		F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
		F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/
	);
	/* Set LA57 based on hardware capability. */
	if (cpuid_ecx(7) & F(LA57))
		kvm_cpu_cap_set(X86_FEATURE_LA57);

	/*
	 * PKU not yet implemented for shadow paging and requires OSPKE
	 * to be set on the host. Clear it if that is not the case
	 */
	if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
		kvm_cpu_cap_clear(X86_FEATURE_PKU);

	kvm_cpu_cap_mask(CPUID_7_EDX,
		F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
		F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
		F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) |
		F(SERIALIZE) | F(TSXLDTRK) | F(AVX512_FP16)
	);

	/* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */
	kvm_cpu_cap_set(X86_FEATURE_TSC_ADJUST);
	kvm_cpu_cap_set(X86_FEATURE_ARCH_CAPABILITIES);

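	/*
	 * Advertise the Intel-defined speculation control features when the
	 * host has the equivalent capability, regardless of how the host
	 * itself enumerates it (e.g. via AMD's 0x80000008 leaf).
	 */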
	if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS))
		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL);
	if (boot_cpu_has(X86_FEATURE_STIBP))
		kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP);
	if (boot_cpu_has(X86_FEATURE_AMD_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);

	kvm_cpu_cap_mask(CPUID_7_1_EAX,
		F(AVX512_BF16)
	);

	kvm_cpu_cap_mask(CPUID_D_1_EAX,
		F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES)
	);

	kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
		F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
		0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) |
		F(TOPOEXT) | F(PERFCTR_CORE)
	);

	kvm_cpu_cap_mask(CPUID_8000_0001_EDX,
		F(FPU) | F(VME) | F(DE) | F(PSE) |
		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
		F(PAT) | F(PSE36) | 0 /* Reserved */ |
		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
		F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) |
		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW)
	);

	if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
		kvm_cpu_cap_set(X86_FEATURE_GBPAGES);

	kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
		F(CLZERO) | F(XSAVEERPTR) |
		F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
		F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON)
	);

	/*
	 * AMD has separate bits for each SPEC_CTRL bit.
	 * arch/x86/kernel/cpu/bugs.c is kind enough to
	 * record that in cpufeatures so use them.
	 */
	if (boot_cpu_has(X86_FEATURE_IBPB))
		kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB);
	if (boot_cpu_has(X86_FEATURE_IBRS))
		kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS);
	if (boot_cpu_has(X86_FEATURE_STIBP))
		kvm_cpu_cap_set(X86_FEATURE_AMD_STIBP);
	if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_AMD_SSBD);
	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
		kvm_cpu_cap_set(X86_FEATURE_AMD_SSB_NO);
	/*
	 * The preference is to use SPEC CTRL MSR instead of the
	 * VIRT_SPEC MSR.
	 */
	if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
	    !boot_cpu_has(X86_FEATURE_AMD_SSBD))
		kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);

	/*
	 * Hide all SVM features by default, SVM will set the cap bits for
	 * features it emulates and/or exposes for L1.
	 */
	kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0);

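	/* Centaur (VIA) PadLock features, enumerated in CPUID.0xC0000001:EDX. */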
	kvm_cpu_cap_mask(CPUID_C000_0001_EDX,
		F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
		F(PMM) | F(PMM_EN)
	);
}
EXPORT_SYMBOL_GPL(kvm_set_cpu_caps);

struct kvm_cpuid_array {
	struct kvm_cpuid_entry2 *entries;
	int maxnent;
	int nent;
};

static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array,
					      u32 function, u32 index)
{
	struct kvm_cpuid_entry2 *entry;

	if (array->nent >= array->maxnent)
		return NULL;

	entry = &array->entries[array->nent++];

	entry->function = function;
	entry->index = index;
	entry->flags = 0;

	cpuid_count(entry->function, entry->index,
		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);

	switch (function) {
	case 4:
	case 7:
	case 0xb:
	case 0xd:
	case 0xf:
	case 0x10:
	case 0x12:
	case 0x14:
	case 0x17:
	case 0x18:
	case 0x1f:
	case 0x8000001d:
		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
		break;
	}

	return entry;
}

static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func)
{
	struct kvm_cpuid_entry2 *entry;

	if (array->nent >= array->maxnent)
		return -E2BIG;

	entry = &array->entries[array->nent];
	entry->function = func;
	entry->index = 0;
	entry->flags = 0;

	switch (func) {
	case 0:
		entry->eax = 7;
		++array->nent;
		break;
	case 1:
		entry->ecx = F(MOVBE);
		++array->nent;
		break;
	case 7:
		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
		entry->eax = 0;
		entry->ecx = F(RDPID);
		++array->nent;
	default:
		break;
	}

	return 0;
}

static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
{
	struct kvm_cpuid_entry2 *entry;
	int r, i, max_idx;

	/* all calls to cpuid_count() should be made on the same cpu */
	get_cpu();

	r = -E2BIG;

	entry = do_host_cpuid(array, function, 0);
	if (!entry)
		goto out;

	switch (function) {
	case 0:
		/* Limited to the highest leaf implemented in KVM. */
		entry->eax = min(entry->eax, 0x1fU);
		break;
	case 1:
		cpuid_entry_override(entry, CPUID_1_EDX);
		cpuid_entry_override(entry, CPUID_1_ECX);
		break;
	case 2:
		/*
		 * On ancient CPUs, function 2 entries are STATEFUL. That is,
		 * CPUID(function=2, index=0) may return different results each
		 * time, with the least-significant byte in EAX enumerating the
		 * number of times software should do CPUID(2, 0).
		 *
		 * Modern CPUs, i.e. every CPU KVM has *ever* run on are less
		 * idiotic. Intel's SDM states that EAX & 0xff "will always
		 * return 01H. Software should ignore this value and not
		 * interpret it as an informational descriptor", while AMD's
		 * APM states that CPUID(2) is reserved.
		 *
		 * WARN if a frankenstein CPU that supports virtualization and
		 * a stateful CPUID.0x2 is encountered.
		 */
		WARN_ON_ONCE((entry->eax & 0xff) > 1);
		break;
	/* functions 4 and 0x8000001d have additional index. */
	case 4:
	case 0x8000001d:
		/*
		 * Read entries until the cache type in the previous entry is
		 * zero, i.e. indicates an invalid entry.
		 */
632 */ 633 for (i = 1; entry->eax & 0x1f; ++i) { 634 entry = do_host_cpuid(array, function, i); 635 if (!entry) 636 goto out; 637 } 638 break; 639 case 6: /* Thermal management */ 640 entry->eax = 0x4; /* allow ARAT */ 641 entry->ebx = 0; 642 entry->ecx = 0; 643 entry->edx = 0; 644 break; 645 /* function 7 has additional index. */ 646 case 7: 647 entry->eax = min(entry->eax, 1u); 648 cpuid_entry_override(entry, CPUID_7_0_EBX); 649 cpuid_entry_override(entry, CPUID_7_ECX); 650 cpuid_entry_override(entry, CPUID_7_EDX); 651 652 /* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */ 653 if (entry->eax == 1) { 654 entry = do_host_cpuid(array, function, 1); 655 if (!entry) 656 goto out; 657 658 cpuid_entry_override(entry, CPUID_7_1_EAX); 659 entry->ebx = 0; 660 entry->ecx = 0; 661 entry->edx = 0; 662 } 663 break; 664 case 9: 665 break; 666 case 0xa: { /* Architectural Performance Monitoring */ 667 struct x86_pmu_capability cap; 668 union cpuid10_eax eax; 669 union cpuid10_edx edx; 670 671 perf_get_x86_pmu_capability(&cap); 672 673 /* 674 * Only support guest architectural pmu on a host 675 * with architectural pmu. 676 */ 677 if (!cap.version) 678 memset(&cap, 0, sizeof(cap)); 679 680 eax.split.version_id = min(cap.version, 2); 681 eax.split.num_counters = cap.num_counters_gp; 682 eax.split.bit_width = cap.bit_width_gp; 683 eax.split.mask_length = cap.events_mask_len; 684 685 edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS); 686 edx.split.bit_width_fixed = cap.bit_width_fixed; 687 edx.split.anythread_deprecated = 1; 688 edx.split.reserved1 = 0; 689 edx.split.reserved2 = 0; 690 691 entry->eax = eax.full; 692 entry->ebx = cap.events_mask; 693 entry->ecx = 0; 694 entry->edx = edx.full; 695 break; 696 } 697 /* 698 * Per Intel's SDM, the 0x1f is a superset of 0xb, 699 * thus they can be handled by common code. 700 */ 701 case 0x1f: 702 case 0xb: 703 /* 704 * Populate entries until the level type (ECX[15:8]) of the 705 * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is 706 * the starting entry, filled by the primary do_host_cpuid(). 707 */ 708 for (i = 1; entry->ecx & 0xff00; ++i) { 709 entry = do_host_cpuid(array, function, i); 710 if (!entry) 711 goto out; 712 } 713 break; 714 case 0xd: 715 entry->eax &= supported_xcr0; 716 entry->ebx = xstate_required_size(supported_xcr0, false); 717 entry->ecx = entry->ebx; 718 entry->edx &= supported_xcr0 >> 32; 719 if (!supported_xcr0) 720 break; 721 722 entry = do_host_cpuid(array, function, 1); 723 if (!entry) 724 goto out; 725 726 cpuid_entry_override(entry, CPUID_D_1_EAX); 727 if (entry->eax & (F(XSAVES)|F(XSAVEC))) 728 entry->ebx = xstate_required_size(supported_xcr0 | supported_xss, 729 true); 730 else { 731 WARN_ON_ONCE(supported_xss != 0); 732 entry->ebx = 0; 733 } 734 entry->ecx &= supported_xss; 735 entry->edx &= supported_xss >> 32; 736 737 for (i = 2; i < 64; ++i) { 738 bool s_state; 739 if (supported_xcr0 & BIT_ULL(i)) 740 s_state = false; 741 else if (supported_xss & BIT_ULL(i)) 742 s_state = true; 743 else 744 continue; 745 746 entry = do_host_cpuid(array, function, i); 747 if (!entry) 748 goto out; 749 750 /* 751 * The supported check above should have filtered out 752 * invalid sub-leafs. Only valid sub-leafs should 753 * reach this point, and they should have a non-zero 754 * save state size. Furthermore, check whether the 755 * processor agrees with supported_xcr0/supported_xss 756 * on whether this is an XCR0- or IA32_XSS-managed area. 
757 */ 758 if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) { 759 --array->nent; 760 continue; 761 } 762 entry->edx = 0; 763 } 764 break; 765 /* Intel PT */ 766 case 0x14: 767 if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) { 768 entry->eax = entry->ebx = entry->ecx = entry->edx = 0; 769 break; 770 } 771 772 for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { 773 if (!do_host_cpuid(array, function, i)) 774 goto out; 775 } 776 break; 777 case KVM_CPUID_SIGNATURE: { 778 static const char signature[12] = "KVMKVMKVM\0\0"; 779 const u32 *sigptr = (const u32 *)signature; 780 entry->eax = KVM_CPUID_FEATURES; 781 entry->ebx = sigptr[0]; 782 entry->ecx = sigptr[1]; 783 entry->edx = sigptr[2]; 784 break; 785 } 786 case KVM_CPUID_FEATURES: 787 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | 788 (1 << KVM_FEATURE_NOP_IO_DELAY) | 789 (1 << KVM_FEATURE_CLOCKSOURCE2) | 790 (1 << KVM_FEATURE_ASYNC_PF) | 791 (1 << KVM_FEATURE_PV_EOI) | 792 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | 793 (1 << KVM_FEATURE_PV_UNHALT) | 794 (1 << KVM_FEATURE_PV_TLB_FLUSH) | 795 (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | 796 (1 << KVM_FEATURE_PV_SEND_IPI) | 797 (1 << KVM_FEATURE_POLL_CONTROL) | 798 (1 << KVM_FEATURE_PV_SCHED_YIELD) | 799 (1 << KVM_FEATURE_ASYNC_PF_INT); 800 801 if (sched_info_on()) 802 entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); 803 804 entry->ebx = 0; 805 entry->ecx = 0; 806 entry->edx = 0; 807 break; 808 case 0x80000000: 809 entry->eax = min(entry->eax, 0x8000001f); 810 break; 811 case 0x80000001: 812 cpuid_entry_override(entry, CPUID_8000_0001_EDX); 813 cpuid_entry_override(entry, CPUID_8000_0001_ECX); 814 break; 815 case 0x80000006: 816 /* L2 cache and TLB: pass through host info. */ 817 break; 818 case 0x80000007: /* Advanced power management */ 819 /* invariant TSC is CPUID.80000007H:EDX[8] */ 820 entry->edx &= (1 << 8); 821 /* mask against host */ 822 entry->edx &= boot_cpu_data.x86_power; 823 entry->eax = entry->ebx = entry->ecx = 0; 824 break; 825 case 0x80000008: { 826 unsigned g_phys_as = (entry->eax >> 16) & 0xff; 827 unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); 828 unsigned phys_as = entry->eax & 0xff; 829 830 if (!g_phys_as) 831 g_phys_as = phys_as; 832 entry->eax = g_phys_as | (virt_as << 8); 833 entry->edx = 0; 834 cpuid_entry_override(entry, CPUID_8000_0008_EBX); 835 break; 836 } 837 case 0x8000000A: 838 if (!kvm_cpu_cap_has(X86_FEATURE_SVM)) { 839 entry->eax = entry->ebx = entry->ecx = entry->edx = 0; 840 break; 841 } 842 entry->eax = 1; /* SVM revision 1 */ 843 entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper 844 ASID emulation to nested SVM */ 845 entry->ecx = 0; /* Reserved */ 846 cpuid_entry_override(entry, CPUID_8000_000A_EDX); 847 break; 848 case 0x80000019: 849 entry->ecx = entry->edx = 0; 850 break; 851 case 0x8000001a: 852 case 0x8000001e: 853 break; 854 /* Support memory encryption cpuid if host supports it */ 855 case 0x8000001F: 856 if (!boot_cpu_has(X86_FEATURE_SEV)) 857 entry->eax = entry->ebx = entry->ecx = entry->edx = 0; 858 break; 859 /*Add support for Centaur's CPUID instruction*/ 860 case 0xC0000000: 861 /*Just support up to 0xC0000004 now*/ 862 entry->eax = min(entry->eax, 0xC0000004); 863 break; 864 case 0xC0000001: 865 cpuid_entry_override(entry, CPUID_C000_0001_EDX); 866 break; 867 case 3: /* Processor serial number */ 868 case 5: /* MONITOR/MWAIT */ 869 case 0xC0000002: 870 case 0xC0000003: 871 case 0xC0000004: 872 default: 873 entry->eax = entry->ebx = entry->ecx = entry->edx = 0; 874 break; 875 } 876 877 r = 0; 878 879 out: 880 
	put_cpu();

	return r;
}

static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func,
			 unsigned int type)
{
	if (type == KVM_GET_EMULATED_CPUID)
		return __do_cpuid_func_emulated(array, func);

	return __do_cpuid_func(array, func);
}

#define CENTAUR_CPUID_SIGNATURE 0xC0000000

static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func,
			  unsigned int type)
{
	u32 limit;
	int r;

	if (func == CENTAUR_CPUID_SIGNATURE &&
	    boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR)
		return 0;

	r = do_cpuid_func(array, func, type);
	if (r)
		return r;

	limit = array->entries[array->nent - 1].eax;
	for (func = func + 1; func <= limit; ++func) {
		r = do_cpuid_func(array, func, type);
		if (r)
			break;
	}

	return r;
}

static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
				 __u32 num_entries, unsigned int ioctl_type)
{
	int i;
	__u32 pad[3];

	if (ioctl_type != KVM_GET_EMULATED_CPUID)
		return false;

	/*
	 * We want to make sure that ->padding is being passed clean from
	 * userspace in case we want to use it for something in the future.
	 *
	 * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we
	 * have to give ourselves satisfied only with the emulated side. /me
	 * sheds a tear.
	 */
	for (i = 0; i < num_entries; i++) {
		if (copy_from_user(pad, entries[i].padding, sizeof(pad)))
			return true;

		if (pad[0] || pad[1] || pad[2])
			return true;
	}
	return false;
}

int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
			    struct kvm_cpuid_entry2 __user *entries,
			    unsigned int type)
{
	static const u32 funcs[] = {
		0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE,
	};

	struct kvm_cpuid_array array = {
		.nent = 0,
	};
	int r, i;

	if (cpuid->nent < 1)
		return -E2BIG;
	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
		cpuid->nent = KVM_MAX_CPUID_ENTRIES;

	if (sanity_check_entries(entries, cpuid->nent, type))
		return -EINVAL;

	array.entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2),
					   cpuid->nent));
	if (!array.entries)
		return -ENOMEM;

	array.maxnent = cpuid->nent;

	for (i = 0; i < ARRAY_SIZE(funcs); i++) {
		r = get_cpuid_func(&array, funcs[i], type);
		if (r)
			goto out_free;
	}
	cpuid->nent = array.nent;

	if (copy_to_user(entries, array.entries,
			 array.nent * sizeof(struct kvm_cpuid_entry2)))
		r = -EFAULT;

out_free:
	vfree(array.entries);
	return r;
}

struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
					      u32 function, u32 index)
{
	return cpuid_entry2_find(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent,
				 function, index);
}
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);

1008 * 1009 * There are three primary classes to be considered, with their respective 1010 * ranges described as "<base> - <top>[,<base2> - <top2>] inclusive. A primary 1011 * class exists if a guest CPUID entry for its <base> leaf exists. For a given 1012 * class, CPUID.<base>.EAX contains the max supported leaf for the class. 1013 * 1014 * - Basic: 0x00000000 - 0x3fffffff, 0x50000000 - 0x7fffffff 1015 * - Hypervisor: 0x40000000 - 0x4fffffff 1016 * - Extended: 0x80000000 - 0xbfffffff 1017 * - Centaur: 0xc0000000 - 0xcfffffff 1018 * 1019 * The Hypervisor class is further subdivided into sub-classes that each act as 1020 * their own indepdent class associated with a 0x100 byte range. E.g. if Qemu 1021 * is advertising support for both HyperV and KVM, the resulting Hypervisor 1022 * CPUID sub-classes are: 1023 * 1024 * - HyperV: 0x40000000 - 0x400000ff 1025 * - KVM: 0x40000100 - 0x400001ff 1026 */ 1027 static struct kvm_cpuid_entry2 * 1028 get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) 1029 { 1030 struct kvm_cpuid_entry2 *basic, *class; 1031 u32 function = *fn_ptr; 1032 1033 basic = kvm_find_cpuid_entry(vcpu, 0, 0); 1034 if (!basic) 1035 return NULL; 1036 1037 if (is_guest_vendor_amd(basic->ebx, basic->ecx, basic->edx) || 1038 is_guest_vendor_hygon(basic->ebx, basic->ecx, basic->edx)) 1039 return NULL; 1040 1041 if (function >= 0x40000000 && function <= 0x4fffffff) 1042 class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0); 1043 else if (function >= 0xc0000000) 1044 class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0); 1045 else 1046 class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); 1047 1048 if (class && function <= class->eax) 1049 return NULL; 1050 1051 /* 1052 * Leaf specific adjustments are also applied when redirecting to the 1053 * max basic entry, e.g. if the max basic leaf is 0xb but there is no 1054 * entry for CPUID.0xb.index (see below), then the output value for EDX 1055 * needs to be pulled from CPUID.0xb.1. 1056 */ 1057 *fn_ptr = basic->eax; 1058 1059 /* 1060 * The class does not exist or the requested function is out of range; 1061 * the effective CPUID entry is the max basic leaf. Note, the index of 1062 * the original requested leaf is observed! 1063 */ 1064 return kvm_find_cpuid_entry(vcpu, basic->eax, index); 1065 } 1066 1067 bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, 1068 u32 *ecx, u32 *edx, bool exact_only) 1069 { 1070 u32 orig_function = *eax, function = *eax, index = *ecx; 1071 struct kvm_cpuid_entry2 *entry; 1072 bool exact, used_max_basic = false; 1073 1074 entry = kvm_find_cpuid_entry(vcpu, function, index); 1075 exact = !!entry; 1076 1077 if (!entry && !exact_only) { 1078 entry = get_out_of_range_cpuid_entry(vcpu, &function, index); 1079 used_max_basic = !!entry; 1080 } 1081 1082 if (entry) { 1083 *eax = entry->eax; 1084 *ebx = entry->ebx; 1085 *ecx = entry->ecx; 1086 *edx = entry->edx; 1087 if (function == 7 && index == 0) { 1088 u64 data; 1089 if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && 1090 (data & TSX_CTRL_CPUID_CLEAR)) 1091 *ebx &= ~(F(RTM) | F(HLE)); 1092 } 1093 } else { 1094 *eax = *ebx = *ecx = *edx = 0; 1095 /* 1096 * When leaf 0BH or 1FH is defined, CL is pass-through 1097 * and EDX is always the x2APIC ID, even for undefined 1098 * subleaves. Index 1 will exist iff the leaf is 1099 * implemented, so we pass through CL iff leaf 1 1100 * exists. EDX can be copied from any existing index. 
1101 */ 1102 if (function == 0xb || function == 0x1f) { 1103 entry = kvm_find_cpuid_entry(vcpu, function, 1); 1104 if (entry) { 1105 *ecx = index & 0xff; 1106 *edx = entry->edx; 1107 } 1108 } 1109 } 1110 trace_kvm_cpuid(orig_function, index, *eax, *ebx, *ecx, *edx, exact, 1111 used_max_basic); 1112 return exact; 1113 } 1114 EXPORT_SYMBOL_GPL(kvm_cpuid); 1115 1116 int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 1117 { 1118 u32 eax, ebx, ecx, edx; 1119 1120 if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0)) 1121 return 1; 1122 1123 eax = kvm_rax_read(vcpu); 1124 ecx = kvm_rcx_read(vcpu); 1125 kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false); 1126 kvm_rax_write(vcpu, eax); 1127 kvm_rbx_write(vcpu, ebx); 1128 kvm_rcx_write(vcpu, ecx); 1129 kvm_rdx_write(vcpu, edx); 1130 return kvm_skip_emulated_instruction(vcpu); 1131 } 1132 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 1133