/*
 * QEMU KVM support
 *
 * Copyright (C) 2006-2008 Qumranet Technologies
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qapi/qapi-events-run-state.h"
#include "qapi/error.h"
#include <sys/ioctl.h>
#include <sys/utsname.h>

#include <linux/kvm.h>
#include "standard-headers/asm-x86/kvm_para.h"

#include "cpu.h"
#include "sysemu/sysemu.h"
#include "sysemu/hw_accel.h"
#include "sysemu/kvm_int.h"
#include "sysemu/runstate.h"
#include "kvm_i386.h"
#include "hyperv.h"
#include "hyperv-proto.h"

#include "exec/gdbstub.h"
#include "qemu/host-utils.h"
#include "qemu/main-loop.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "hw/i386/x86.h"
#include "hw/i386/apic.h"
#include "hw/i386/apic_internal.h"
#include "hw/i386/apic-msidef.h"
#include "hw/i386/intel_iommu.h"
#include "hw/i386/x86-iommu.h"
#include "hw/i386/e820_memory_layout.h"
#include "sysemu/sev.h"

#include "hw/pci/pci.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "migration/blocker.h"
#include "exec/memattrs.h"
#include "trace.h"

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

/* From arch/x86/kvm/lapic.h */
#define KVM_APIC_BUS_CYCLE_NS       1
#define KVM_APIC_BUS_FREQUENCY      (1000000000ULL / KVM_APIC_BUS_CYCLE_NS)

#define MSR_KVM_WALL_CLOCK  0x11
#define MSR_KVM_SYSTEM_TIME 0x12

/* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
 * 255 kvm_msr_entry structs */
#define MSR_BUF_SIZE 4096

static void kvm_init_msrs(X86CPU *cpu);

const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
    KVM_CAP_INFO(SET_TSS_ADDR),
    KVM_CAP_INFO(EXT_CPUID),
    KVM_CAP_INFO(MP_STATE),
    KVM_CAP_LAST_INFO
};

static bool has_msr_star;
static bool has_msr_hsave_pa;
static bool has_msr_tsc_aux;
static bool has_msr_tsc_adjust;
static bool has_msr_tsc_deadline;
static bool has_msr_feature_control;
static bool has_msr_misc_enable;
static bool has_msr_smbase;
static bool has_msr_bndcfgs;
static int lm_capable_kernel;
static bool has_msr_hv_hypercall;
static bool has_msr_hv_crash;
static bool has_msr_hv_reset;
static bool has_msr_hv_vpindex;
static bool hv_vpindex_settable;
static bool has_msr_hv_runtime;
static bool has_msr_hv_synic;
static bool has_msr_hv_stimer;
static bool has_msr_hv_frequencies;
static bool has_msr_hv_reenlightenment;
static bool has_msr_xss;
static bool has_msr_umwait;
static bool has_msr_spec_ctrl;
static bool has_msr_tsx_ctrl;
static bool has_msr_virt_ssbd;
static bool has_msr_smi_count;
static bool has_msr_arch_capabs;
static bool has_msr_core_capabs;
static bool has_msr_vmx_vmfunc;
static bool has_msr_ucode_rev;
static bool has_msr_vmx_procbased_ctls2;
static bool has_msr_perf_capabs;

static uint32_t has_architectural_pmu_version;
static uint32_t num_architectural_pmu_gp_counters;
static uint32_t num_architectural_pmu_fixed_counters;

static int has_xsave;
static int has_xcrs;
static int has_pit_state2;
static int has_exception_payload;

static bool has_msr_mcg_ext_ctl;

static struct kvm_cpuid2 *cpuid_cache;
static struct kvm_msr_list *kvm_feature_msrs;
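/*
 * The has_* / has_msr_* flags above are probed once during accelerator setup
 * (see kvm_arch_init() and kvm_get_supported_msrs() below) and are then
 * consulted whenever per-vCPU CPUID and MSR state is assembled.
 */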
int kvm_has_pit_state2(void)
{
    return has_pit_state2;
}

bool kvm_has_smm(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
}

bool kvm_has_adjust_clock_stable(void)
{
    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);

    return (ret == KVM_CLOCK_TSC_STABLE);
}

bool kvm_has_adjust_clock(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
}

bool kvm_has_exception_payload(void)
{
    return has_exception_payload;
}

static bool kvm_x2apic_api_set_flags(uint64_t flags)
{
    KVMState *s = KVM_STATE(current_accel());

    return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags);
}

#define MEMORIZE(fn, _result) \
    ({ \
        static bool _memorized; \
        \
        if (_memorized) { \
            return _result; \
        } \
        _memorized = true; \
        _result = fn; \
    })

static bool has_x2apic_api;

bool kvm_has_x2apic_api(void)
{
    return has_x2apic_api;
}

bool kvm_enable_x2apic(void)
{
    return MEMORIZE(
        kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS |
                                 KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK),
        has_x2apic_api);
}

bool kvm_hv_vpindex_settable(void)
{
    return hv_vpindex_settable;
}

static int kvm_get_tsc(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    struct {
        struct kvm_msrs info;
        struct kvm_msr_entry entries[1];
    } msr_data = {};
    int ret;

    if (env->tsc_valid) {
        return 0;
    }

    memset(&msr_data, 0, sizeof(msr_data));
    msr_data.info.nmsrs = 1;
    msr_data.entries[0].index = MSR_IA32_TSC;
    env->tsc_valid = !runstate_is_running();

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
    if (ret < 0) {
        return ret;
    }

    assert(ret == 1);
    env->tsc = msr_data.entries[0].data;
    return 0;
}

static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
{
    kvm_get_tsc(cpu);
}

void kvm_synchronize_all_tsc(void)
{
    CPUState *cpu;

    if (kvm_enabled()) {
        CPU_FOREACH(cpu) {
            run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
        }
    }
}

static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
{
    struct kvm_cpuid2 *cpuid;
    int r, size;

    size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
    cpuid = g_malloc0(size);
    cpuid->nent = max;
    r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
    if (r == 0 && cpuid->nent >= max) {
        r = -E2BIG;
    }
    if (r < 0) {
        if (r == -E2BIG) {
            g_free(cpuid);
            return NULL;
        } else {
            fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
                    strerror(-r));
            exit(1);
        }
    }
    return cpuid;
}
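/*
 * Note that try_get_cpuid() returns NULL (after freeing the buffer) when
 * 'max' was not large enough for the kernel's table; callers retry with a
 * bigger buffer until the ioctl succeeds.
 */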
/* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
 * for all entries.
 */
static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
{
    struct kvm_cpuid2 *cpuid;
    int max = 1;

    if (cpuid_cache != NULL) {
        return cpuid_cache;
    }
    while ((cpuid = try_get_cpuid(s, max)) == NULL) {
        max *= 2;
    }
    cpuid_cache = cpuid;
    return cpuid;
}

static bool host_tsx_broken(void)
{
    int family, model, stepping;
    char vendor[CPUID_VENDOR_SZ + 1];

    host_vendor_fms(vendor, &family, &model, &stepping);

    /* Check if we are running on a Haswell host known to have broken TSX */
    return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
           (family == 6) &&
           ((model == 63 && stepping < 4) ||
            model == 60 || model == 69 || model == 70);
}

/* Returns the value for a specific register on the cpuid entry
 */
static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
{
    uint32_t ret = 0;
    switch (reg) {
    case R_EAX:
        ret = entry->eax;
        break;
    case R_EBX:
        ret = entry->ebx;
        break;
    case R_ECX:
        ret = entry->ecx;
        break;
    case R_EDX:
        ret = entry->edx;
        break;
    }
    return ret;
}

/* Find matching entry for function/index on kvm_cpuid2 struct
 */
static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
                                                 uint32_t function,
                                                 uint32_t index)
{
    int i;
    for (i = 0; i < cpuid->nent; ++i) {
        if (cpuid->entries[i].function == function &&
            cpuid->entries[i].index == index) {
            return &cpuid->entries[i];
        }
    }
    /* not found: */
    return NULL;
}

uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
                                      uint32_t index, int reg)
{
    struct kvm_cpuid2 *cpuid;
    uint32_t ret = 0;
    uint32_t cpuid_1_edx;

    cpuid = get_supported_cpuid(s);

    struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
    if (entry) {
        ret = cpuid_entry_get_reg(entry, reg);
    }

    /* Fixups for the data returned by KVM, below */

    if (function == 1 && reg == R_EDX) {
        /* KVM before 2.6.30 misreports the following features */
        ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
    } else if (function == 1 && reg == R_ECX) {
        /* We can set the hypervisor flag, even if KVM does not return it on
         * GET_SUPPORTED_CPUID
         */
        ret |= CPUID_EXT_HYPERVISOR;
        /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
         * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
         * and the irqchip is in the kernel.
         */
        if (kvm_irqchip_in_kernel() &&
            kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
            ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
        }

        /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
         * without the in-kernel irqchip
         */
        if (!kvm_irqchip_in_kernel()) {
            ret &= ~CPUID_EXT_X2APIC;
        }

        if (enable_cpu_pm) {
            int disable_exits = kvm_check_extension(s,
                                                    KVM_CAP_X86_DISABLE_EXITS);

            if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
                ret |= CPUID_EXT_MONITOR;
            }
        }
    } else if (function == 6 && reg == R_EAX) {
        ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
    } else if (function == 7 && index == 0 && reg == R_EBX) {
        if (host_tsx_broken()) {
            ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
        }
    } else if (function == 7 && index == 0 && reg == R_EDX) {
        /*
         * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts.
394 * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is 395 * returned by KVM_GET_MSR_INDEX_LIST. 396 */ 397 if (!has_msr_arch_capabs) { 398 ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES; 399 } 400 } else if (function == 0x80000001 && reg == R_ECX) { 401 /* 402 * It's safe to enable TOPOEXT even if it's not returned by 403 * GET_SUPPORTED_CPUID. Unconditionally enabling TOPOEXT here allows 404 * us to keep CPU models including TOPOEXT runnable on older kernels. 405 */ 406 ret |= CPUID_EXT3_TOPOEXT; 407 } else if (function == 0x80000001 && reg == R_EDX) { 408 /* On Intel, kvm returns cpuid according to the Intel spec, 409 * so add missing bits according to the AMD spec: 410 */ 411 cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX); 412 ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES; 413 } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) { 414 /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't 415 * be enabled without the in-kernel irqchip 416 */ 417 if (!kvm_irqchip_in_kernel()) { 418 ret &= ~(1U << KVM_FEATURE_PV_UNHALT); 419 } 420 if (kvm_irqchip_is_split()) { 421 ret |= 1U << KVM_FEATURE_MSI_EXT_DEST_ID; 422 } 423 } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) { 424 ret |= 1U << KVM_HINTS_REALTIME; 425 } 426 427 return ret; 428 } 429 430 uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index) 431 { 432 struct { 433 struct kvm_msrs info; 434 struct kvm_msr_entry entries[1]; 435 } msr_data = {}; 436 uint64_t value; 437 uint32_t ret, can_be_one, must_be_one; 438 439 if (kvm_feature_msrs == NULL) { /* Host doesn't support feature MSRs */ 440 return 0; 441 } 442 443 /* Check if requested MSR is supported feature MSR */ 444 int i; 445 for (i = 0; i < kvm_feature_msrs->nmsrs; i++) 446 if (kvm_feature_msrs->indices[i] == index) { 447 break; 448 } 449 if (i == kvm_feature_msrs->nmsrs) { 450 return 0; /* if the feature MSR is not supported, simply return 0 */ 451 } 452 453 msr_data.info.nmsrs = 1; 454 msr_data.entries[0].index = index; 455 456 ret = kvm_ioctl(s, KVM_GET_MSRS, &msr_data); 457 if (ret != 1) { 458 error_report("KVM get MSR (index=0x%x) feature failed, %s", 459 index, strerror(-ret)); 460 exit(1); 461 } 462 463 value = msr_data.entries[0].data; 464 switch (index) { 465 case MSR_IA32_VMX_PROCBASED_CTLS2: 466 if (!has_msr_vmx_procbased_ctls2) { 467 /* KVM forgot to add these bits for some time, do this ourselves. */ 468 if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) & 469 CPUID_XSAVE_XSAVES) { 470 value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32; 471 } 472 if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) & 473 CPUID_EXT_RDRAND) { 474 value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32; 475 } 476 if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & 477 CPUID_7_0_EBX_INVPCID) { 478 value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32; 479 } 480 if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) & 481 CPUID_7_0_EBX_RDSEED) { 482 value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32; 483 } 484 if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) & 485 CPUID_EXT2_RDTSCP) { 486 value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32; 487 } 488 } 489 /* fall through */ 490 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 491 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 492 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 493 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 494 /* 495 * Return true for bits that can be one, but do not have to be one. 
496 * The SDM tells us which bits could have a "must be one" setting, 497 * so we can do the opposite transformation in make_vmx_msr_value. 498 */ 499 must_be_one = (uint32_t)value; 500 can_be_one = (uint32_t)(value >> 32); 501 return can_be_one & ~must_be_one; 502 503 default: 504 return value; 505 } 506 } 507 508 static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap, 509 int *max_banks) 510 { 511 int r; 512 513 r = kvm_check_extension(s, KVM_CAP_MCE); 514 if (r > 0) { 515 *max_banks = r; 516 return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap); 517 } 518 return -ENOSYS; 519 } 520 521 static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code) 522 { 523 CPUState *cs = CPU(cpu); 524 CPUX86State *env = &cpu->env; 525 uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN | 526 MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S; 527 uint64_t mcg_status = MCG_STATUS_MCIP; 528 int flags = 0; 529 530 if (code == BUS_MCEERR_AR) { 531 status |= MCI_STATUS_AR | 0x134; 532 mcg_status |= MCG_STATUS_EIPV; 533 } else { 534 status |= 0xc0; 535 mcg_status |= MCG_STATUS_RIPV; 536 } 537 538 flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0; 539 /* We need to read back the value of MSR_EXT_MCG_CTL that was set by the 540 * guest kernel back into env->mcg_ext_ctl. 541 */ 542 cpu_synchronize_state(cs); 543 if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) { 544 mcg_status |= MCG_STATUS_LMCE; 545 flags = 0; 546 } 547 548 cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr, 549 (MCM_ADDR_PHYS << 6) | 0xc, flags); 550 } 551 552 static void emit_hypervisor_memory_failure(MemoryFailureAction action, bool ar) 553 { 554 MemoryFailureFlags mff = {.action_required = ar, .recursive = false}; 555 556 qapi_event_send_memory_failure(MEMORY_FAILURE_RECIPIENT_HYPERVISOR, action, 557 &mff); 558 } 559 560 static void hardware_memory_error(void *host_addr) 561 { 562 emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_FATAL, true); 563 error_report("QEMU got Hardware memory error at addr %p", host_addr); 564 exit(1); 565 } 566 567 void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) 568 { 569 X86CPU *cpu = X86_CPU(c); 570 CPUX86State *env = &cpu->env; 571 ram_addr_t ram_addr; 572 hwaddr paddr; 573 574 /* If we get an action required MCE, it has been injected by KVM 575 * while the VM was running. An action optional MCE instead should 576 * be coming from the main thread, which qemu_init_sigbus identifies 577 * as the "early kill" thread. 578 */ 579 assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO); 580 581 if ((env->mcg_cap & MCG_SER_P) && addr) { 582 ram_addr = qemu_ram_addr_from_host(addr); 583 if (ram_addr != RAM_ADDR_INVALID && 584 kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { 585 kvm_hwpoison_page_add(ram_addr); 586 kvm_mce_inject(cpu, paddr, code); 587 588 /* 589 * Use different logging severity based on error type. 590 * If there is additional MCE reporting on the hypervisor, QEMU VA 591 * could be another source to identify the PA and MCE details. 
             */
            if (code == BUS_MCEERR_AR) {
                error_report("Guest MCE Memory Error at QEMU addr %p and "
                    "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
                    addr, paddr, "BUS_MCEERR_AR");
            } else {
                warn_report("Guest MCE Memory Error at QEMU addr %p and "
                    "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
                    addr, paddr, "BUS_MCEERR_AO");
            }

            return;
        }

        if (code == BUS_MCEERR_AO) {
            warn_report("Hardware memory error at addr %p of type %s "
                "for memory used by QEMU itself instead of guest system!",
                addr, "BUS_MCEERR_AO");
        }
    }

    if (code == BUS_MCEERR_AR) {
        hardware_memory_error(addr);
    }

    /* Hope we are lucky for AO MCE, just notify an event */
    emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_IGNORE, false);
}

static void kvm_reset_exception(CPUX86State *env)
{
    env->exception_nr = -1;
    env->exception_pending = 0;
    env->exception_injected = 0;
    env->exception_has_payload = false;
    env->exception_payload = 0;
}

static void kvm_queue_exception(CPUX86State *env,
                                int32_t exception_nr,
                                uint8_t exception_has_payload,
                                uint64_t exception_payload)
{
    assert(env->exception_nr == -1);
    assert(!env->exception_pending);
    assert(!env->exception_injected);
    assert(!env->exception_has_payload);

    env->exception_nr = exception_nr;

    if (has_exception_payload) {
        env->exception_pending = 1;

        env->exception_has_payload = exception_has_payload;
        env->exception_payload = exception_payload;
    } else {
        env->exception_injected = 1;

        if (exception_nr == EXCP01_DB) {
            assert(exception_has_payload);
            env->dr[6] = exception_payload;
        } else if (exception_nr == EXCP0E_PAGE) {
            assert(exception_has_payload);
            env->cr[2] = exception_payload;
        } else {
            assert(!exception_has_payload);
        }
    }
}

static int kvm_inject_mce_oldstyle(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;

    if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
        unsigned int bank, bank_num = env->mcg_cap & 0xff;
        struct kvm_x86_mce mce;

        kvm_reset_exception(env);

        /*
         * There must be at least one bank in use if an MCE is pending.
         * Find it and use its values for the event injection.
675 */ 676 for (bank = 0; bank < bank_num; bank++) { 677 if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) { 678 break; 679 } 680 } 681 assert(bank < bank_num); 682 683 mce.bank = bank; 684 mce.status = env->mce_banks[bank * 4 + 1]; 685 mce.mcg_status = env->mcg_status; 686 mce.addr = env->mce_banks[bank * 4 + 2]; 687 mce.misc = env->mce_banks[bank * 4 + 3]; 688 689 return kvm_vcpu_ioctl(CPU(cpu), KVM_X86_SET_MCE, &mce); 690 } 691 return 0; 692 } 693 694 static void cpu_update_state(void *opaque, int running, RunState state) 695 { 696 CPUX86State *env = opaque; 697 698 if (running) { 699 env->tsc_valid = false; 700 } 701 } 702 703 unsigned long kvm_arch_vcpu_id(CPUState *cs) 704 { 705 X86CPU *cpu = X86_CPU(cs); 706 return cpu->apic_id; 707 } 708 709 #ifndef KVM_CPUID_SIGNATURE_NEXT 710 #define KVM_CPUID_SIGNATURE_NEXT 0x40000100 711 #endif 712 713 static bool hyperv_enabled(X86CPU *cpu) 714 { 715 CPUState *cs = CPU(cpu); 716 return kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0 && 717 ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) || 718 cpu->hyperv_features || cpu->hyperv_passthrough); 719 } 720 721 /* 722 * Check whether target_freq is within conservative 723 * ntp correctable bounds (250ppm) of freq 724 */ 725 static inline bool freq_within_bounds(int freq, int target_freq) 726 { 727 int max_freq = freq + (freq * 250 / 1000000); 728 int min_freq = freq - (freq * 250 / 1000000); 729 730 if (target_freq >= min_freq && target_freq <= max_freq) { 731 return true; 732 } 733 734 return false; 735 } 736 737 static int kvm_arch_set_tsc_khz(CPUState *cs) 738 { 739 X86CPU *cpu = X86_CPU(cs); 740 CPUX86State *env = &cpu->env; 741 int r, cur_freq; 742 bool set_ioctl = false; 743 744 if (!env->tsc_khz) { 745 return 0; 746 } 747 748 cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ? 749 kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP; 750 751 /* 752 * If TSC scaling is supported, attempt to set TSC frequency. 753 */ 754 if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) { 755 set_ioctl = true; 756 } 757 758 /* 759 * If desired TSC frequency is within bounds of NTP correction, 760 * attempt to set TSC frequency. 761 */ 762 if (cur_freq != -ENOTSUP && freq_within_bounds(cur_freq, env->tsc_khz)) { 763 set_ioctl = true; 764 } 765 766 r = set_ioctl ? 767 kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) : 768 -ENOTSUP; 769 770 if (r < 0) { 771 /* When KVM_SET_TSC_KHZ fails, it's an error only if the current 772 * TSC frequency doesn't match the one we want. 773 */ 774 cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ? 
775 kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : 776 -ENOTSUP; 777 if (cur_freq <= 0 || cur_freq != env->tsc_khz) { 778 warn_report("TSC frequency mismatch between " 779 "VM (%" PRId64 " kHz) and host (%d kHz), " 780 "and TSC scaling unavailable", 781 env->tsc_khz, cur_freq); 782 return r; 783 } 784 } 785 786 return 0; 787 } 788 789 static bool tsc_is_stable_and_known(CPUX86State *env) 790 { 791 if (!env->tsc_khz) { 792 return false; 793 } 794 return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) 795 || env->user_tsc_khz; 796 } 797 798 static struct { 799 const char *desc; 800 struct { 801 uint32_t fw; 802 uint32_t bits; 803 } flags[2]; 804 uint64_t dependencies; 805 } kvm_hyperv_properties[] = { 806 [HYPERV_FEAT_RELAXED] = { 807 .desc = "relaxed timing (hv-relaxed)", 808 .flags = { 809 {.fw = FEAT_HYPERV_EAX, 810 .bits = HV_HYPERCALL_AVAILABLE}, 811 {.fw = FEAT_HV_RECOMM_EAX, 812 .bits = HV_RELAXED_TIMING_RECOMMENDED} 813 } 814 }, 815 [HYPERV_FEAT_VAPIC] = { 816 .desc = "virtual APIC (hv-vapic)", 817 .flags = { 818 {.fw = FEAT_HYPERV_EAX, 819 .bits = HV_HYPERCALL_AVAILABLE | HV_APIC_ACCESS_AVAILABLE}, 820 {.fw = FEAT_HV_RECOMM_EAX, 821 .bits = HV_APIC_ACCESS_RECOMMENDED} 822 } 823 }, 824 [HYPERV_FEAT_TIME] = { 825 .desc = "clocksources (hv-time)", 826 .flags = { 827 {.fw = FEAT_HYPERV_EAX, 828 .bits = HV_HYPERCALL_AVAILABLE | HV_TIME_REF_COUNT_AVAILABLE | 829 HV_REFERENCE_TSC_AVAILABLE} 830 } 831 }, 832 [HYPERV_FEAT_CRASH] = { 833 .desc = "crash MSRs (hv-crash)", 834 .flags = { 835 {.fw = FEAT_HYPERV_EDX, 836 .bits = HV_GUEST_CRASH_MSR_AVAILABLE} 837 } 838 }, 839 [HYPERV_FEAT_RESET] = { 840 .desc = "reset MSR (hv-reset)", 841 .flags = { 842 {.fw = FEAT_HYPERV_EAX, 843 .bits = HV_RESET_AVAILABLE} 844 } 845 }, 846 [HYPERV_FEAT_VPINDEX] = { 847 .desc = "VP_INDEX MSR (hv-vpindex)", 848 .flags = { 849 {.fw = FEAT_HYPERV_EAX, 850 .bits = HV_VP_INDEX_AVAILABLE} 851 } 852 }, 853 [HYPERV_FEAT_RUNTIME] = { 854 .desc = "VP_RUNTIME MSR (hv-runtime)", 855 .flags = { 856 {.fw = FEAT_HYPERV_EAX, 857 .bits = HV_VP_RUNTIME_AVAILABLE} 858 } 859 }, 860 [HYPERV_FEAT_SYNIC] = { 861 .desc = "synthetic interrupt controller (hv-synic)", 862 .flags = { 863 {.fw = FEAT_HYPERV_EAX, 864 .bits = HV_SYNIC_AVAILABLE} 865 } 866 }, 867 [HYPERV_FEAT_STIMER] = { 868 .desc = "synthetic timers (hv-stimer)", 869 .flags = { 870 {.fw = FEAT_HYPERV_EAX, 871 .bits = HV_SYNTIMERS_AVAILABLE} 872 }, 873 .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME) 874 }, 875 [HYPERV_FEAT_FREQUENCIES] = { 876 .desc = "frequency MSRs (hv-frequencies)", 877 .flags = { 878 {.fw = FEAT_HYPERV_EAX, 879 .bits = HV_ACCESS_FREQUENCY_MSRS}, 880 {.fw = FEAT_HYPERV_EDX, 881 .bits = HV_FREQUENCY_MSRS_AVAILABLE} 882 } 883 }, 884 [HYPERV_FEAT_REENLIGHTENMENT] = { 885 .desc = "reenlightenment MSRs (hv-reenlightenment)", 886 .flags = { 887 {.fw = FEAT_HYPERV_EAX, 888 .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL} 889 } 890 }, 891 [HYPERV_FEAT_TLBFLUSH] = { 892 .desc = "paravirtualized TLB flush (hv-tlbflush)", 893 .flags = { 894 {.fw = FEAT_HV_RECOMM_EAX, 895 .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED | 896 HV_EX_PROCESSOR_MASKS_RECOMMENDED} 897 }, 898 .dependencies = BIT(HYPERV_FEAT_VPINDEX) 899 }, 900 [HYPERV_FEAT_EVMCS] = { 901 .desc = "enlightened VMCS (hv-evmcs)", 902 .flags = { 903 {.fw = FEAT_HV_RECOMM_EAX, 904 .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED} 905 }, 906 .dependencies = BIT(HYPERV_FEAT_VAPIC) 907 }, 908 [HYPERV_FEAT_IPI] = { 909 .desc = "paravirtualized IPI (hv-ipi)", 910 .flags = { 911 {.fw = FEAT_HV_RECOMM_EAX, 912 .bits = 
HV_CLUSTER_IPI_RECOMMENDED | 913 HV_EX_PROCESSOR_MASKS_RECOMMENDED} 914 }, 915 .dependencies = BIT(HYPERV_FEAT_VPINDEX) 916 }, 917 [HYPERV_FEAT_STIMER_DIRECT] = { 918 .desc = "direct mode synthetic timers (hv-stimer-direct)", 919 .flags = { 920 {.fw = FEAT_HYPERV_EDX, 921 .bits = HV_STIMER_DIRECT_MODE_AVAILABLE} 922 }, 923 .dependencies = BIT(HYPERV_FEAT_STIMER) 924 }, 925 }; 926 927 static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max) 928 { 929 struct kvm_cpuid2 *cpuid; 930 int r, size; 931 932 size = sizeof(*cpuid) + max * sizeof(*cpuid->entries); 933 cpuid = g_malloc0(size); 934 cpuid->nent = max; 935 936 r = kvm_vcpu_ioctl(cs, KVM_GET_SUPPORTED_HV_CPUID, cpuid); 937 if (r == 0 && cpuid->nent >= max) { 938 r = -E2BIG; 939 } 940 if (r < 0) { 941 if (r == -E2BIG) { 942 g_free(cpuid); 943 return NULL; 944 } else { 945 fprintf(stderr, "KVM_GET_SUPPORTED_HV_CPUID failed: %s\n", 946 strerror(-r)); 947 exit(1); 948 } 949 } 950 return cpuid; 951 } 952 953 /* 954 * Run KVM_GET_SUPPORTED_HV_CPUID ioctl(), allocating a buffer large enough 955 * for all entries. 956 */ 957 static struct kvm_cpuid2 *get_supported_hv_cpuid(CPUState *cs) 958 { 959 struct kvm_cpuid2 *cpuid; 960 int max = 7; /* 0x40000000..0x40000005, 0x4000000A */ 961 962 /* 963 * When the buffer is too small, KVM_GET_SUPPORTED_HV_CPUID fails with 964 * -E2BIG, however, it doesn't report back the right size. Keep increasing 965 * it and re-trying until we succeed. 966 */ 967 while ((cpuid = try_get_hv_cpuid(cs, max)) == NULL) { 968 max++; 969 } 970 return cpuid; 971 } 972 973 /* 974 * When KVM_GET_SUPPORTED_HV_CPUID is not supported we fill CPUID feature 975 * leaves from KVM_CAP_HYPERV* and present MSRs data. 976 */ 977 static struct kvm_cpuid2 *get_supported_hv_cpuid_legacy(CPUState *cs) 978 { 979 X86CPU *cpu = X86_CPU(cs); 980 struct kvm_cpuid2 *cpuid; 981 struct kvm_cpuid_entry2 *entry_feat, *entry_recomm; 982 983 /* HV_CPUID_FEATURES, HV_CPUID_ENLIGHTMENT_INFO */ 984 cpuid = g_malloc0(sizeof(*cpuid) + 2 * sizeof(*cpuid->entries)); 985 cpuid->nent = 2; 986 987 /* HV_CPUID_VENDOR_AND_MAX_FUNCTIONS */ 988 entry_feat = &cpuid->entries[0]; 989 entry_feat->function = HV_CPUID_FEATURES; 990 991 entry_recomm = &cpuid->entries[1]; 992 entry_recomm->function = HV_CPUID_ENLIGHTMENT_INFO; 993 entry_recomm->ebx = cpu->hyperv_spinlock_attempts; 994 995 if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0) { 996 entry_feat->eax |= HV_HYPERCALL_AVAILABLE; 997 entry_feat->eax |= HV_APIC_ACCESS_AVAILABLE; 998 entry_feat->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE; 999 entry_recomm->eax |= HV_RELAXED_TIMING_RECOMMENDED; 1000 entry_recomm->eax |= HV_APIC_ACCESS_RECOMMENDED; 1001 } 1002 1003 if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) { 1004 entry_feat->eax |= HV_TIME_REF_COUNT_AVAILABLE; 1005 entry_feat->eax |= HV_REFERENCE_TSC_AVAILABLE; 1006 } 1007 1008 if (has_msr_hv_frequencies) { 1009 entry_feat->eax |= HV_ACCESS_FREQUENCY_MSRS; 1010 entry_feat->edx |= HV_FREQUENCY_MSRS_AVAILABLE; 1011 } 1012 1013 if (has_msr_hv_crash) { 1014 entry_feat->edx |= HV_GUEST_CRASH_MSR_AVAILABLE; 1015 } 1016 1017 if (has_msr_hv_reenlightenment) { 1018 entry_feat->eax |= HV_ACCESS_REENLIGHTENMENTS_CONTROL; 1019 } 1020 1021 if (has_msr_hv_reset) { 1022 entry_feat->eax |= HV_RESET_AVAILABLE; 1023 } 1024 1025 if (has_msr_hv_vpindex) { 1026 entry_feat->eax |= HV_VP_INDEX_AVAILABLE; 1027 } 1028 1029 if (has_msr_hv_runtime) { 1030 entry_feat->eax |= HV_VP_RUNTIME_AVAILABLE; 1031 } 1032 1033 if (has_msr_hv_synic) { 1034 unsigned int 
cap = cpu->hyperv_synic_kvm_only ?
            KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;

        if (kvm_check_extension(cs->kvm_state, cap) > 0) {
            entry_feat->eax |= HV_SYNIC_AVAILABLE;
        }
    }

    if (has_msr_hv_stimer) {
        entry_feat->eax |= HV_SYNTIMERS_AVAILABLE;
    }

    if (kvm_check_extension(cs->kvm_state,
                            KVM_CAP_HYPERV_TLBFLUSH) > 0) {
        entry_recomm->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED;
        entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
    }

    if (kvm_check_extension(cs->kvm_state,
                            KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
        entry_recomm->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
    }

    if (kvm_check_extension(cs->kvm_state,
                            KVM_CAP_HYPERV_SEND_IPI) > 0) {
        entry_recomm->eax |= HV_CLUSTER_IPI_RECOMMENDED;
        entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
    }

    return cpuid;
}

static int hv_cpuid_get_fw(struct kvm_cpuid2 *cpuid, int fw, uint32_t *r)
{
    struct kvm_cpuid_entry2 *entry;
    uint32_t func;
    int reg;

    switch (fw) {
    case FEAT_HYPERV_EAX:
        reg = R_EAX;
        func = HV_CPUID_FEATURES;
        break;
    case FEAT_HYPERV_EDX:
        reg = R_EDX;
        func = HV_CPUID_FEATURES;
        break;
    case FEAT_HV_RECOMM_EAX:
        reg = R_EAX;
        func = HV_CPUID_ENLIGHTMENT_INFO;
        break;
    default:
        return -EINVAL;
    }

    entry = cpuid_find_entry(cpuid, func, 0);
    if (!entry) {
        return -ENOENT;
    }

    switch (reg) {
    case R_EAX:
        *r = entry->eax;
        break;
    case R_EDX:
        *r = entry->edx;
        break;
    default:
        return -EINVAL;
    }

    return 0;
}

static int hv_cpuid_check_and_set(CPUState *cs, struct kvm_cpuid2 *cpuid,
                                  int feature)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint32_t r, fw, bits;
    uint64_t deps;
    int i, dep_feat;

    if (!hyperv_feat_enabled(cpu, feature) && !cpu->hyperv_passthrough) {
        return 0;
    }

    deps = kvm_hyperv_properties[feature].dependencies;
    while (deps) {
        dep_feat = ctz64(deps);
        if (!(hyperv_feat_enabled(cpu, dep_feat))) {
            fprintf(stderr,
                    "Hyper-V %s requires Hyper-V %s\n",
                    kvm_hyperv_properties[feature].desc,
                    kvm_hyperv_properties[dep_feat].desc);
            return 1;
        }
        deps &= ~(1ull << dep_feat);
    }

    for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {
        fw = kvm_hyperv_properties[feature].flags[i].fw;
        bits = kvm_hyperv_properties[feature].flags[i].bits;

        if (!fw) {
            continue;
        }

        if (hv_cpuid_get_fw(cpuid, fw, &r) || (r & bits) != bits) {
            if (hyperv_feat_enabled(cpu, feature)) {
                fprintf(stderr,
                        "Hyper-V %s is not supported by kernel\n",
                        kvm_hyperv_properties[feature].desc);
                return 1;
            } else {
                return 0;
            }
        }

        env->features[fw] |= bits;
    }

    if (cpu->hyperv_passthrough) {
        cpu->hyperv_features |= BIT(feature);
    }

    return 0;
}
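/*
 * hv_cpuid_check_and_set() returns 0 when the feature is supported or simply
 * not requested, and 1 when a requested feature (or one of its dependencies)
 * is unavailable; hyperv_handle_properties() below ORs these results together
 * and fails vCPU creation if any check failed.
 */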
/*
 * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent in
 * case of success, errno < 0 in case of failure and 0 when no Hyper-V
 * extensions are enabled.
 */
static int hyperv_handle_properties(CPUState *cs,
                                    struct kvm_cpuid_entry2 *cpuid_ent)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    struct kvm_cpuid2 *cpuid;
    struct kvm_cpuid_entry2 *c;
    uint32_t cpuid_i = 0;
    int r;

    if (!hyperv_enabled(cpu))
        return 0;

    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ||
        cpu->hyperv_passthrough) {
        uint16_t evmcs_version;

        r = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
                                (uintptr_t)&evmcs_version);

        if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) && r) {
            fprintf(stderr, "Hyper-V %s is not supported by kernel\n",
                    kvm_hyperv_properties[HYPERV_FEAT_EVMCS].desc);
            return -ENOSYS;
        }

        if (!r) {
            env->features[FEAT_HV_RECOMM_EAX] |=
                HV_ENLIGHTENED_VMCS_RECOMMENDED;
            env->features[FEAT_HV_NESTED_EAX] = evmcs_version;
        }
    }

    if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_CPUID) > 0) {
        cpuid = get_supported_hv_cpuid(cs);
    } else {
        cpuid = get_supported_hv_cpuid_legacy(cs);
    }

    if (cpu->hyperv_passthrough) {
        memcpy(cpuid_ent, &cpuid->entries[0],
               cpuid->nent * sizeof(cpuid->entries[0]));

        c = cpuid_find_entry(cpuid, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, 0);
        if (c) {
            cpu->hyperv_vendor_id[0] = c->ebx;
            cpu->hyperv_vendor_id[1] = c->ecx;
            cpu->hyperv_vendor_id[2] = c->edx;
        }

        c = cpuid_find_entry(cpuid, HV_CPUID_INTERFACE, 0);
        if (c) {
            cpu->hyperv_interface_id[0] = c->eax;
            cpu->hyperv_interface_id[1] = c->ebx;
            cpu->hyperv_interface_id[2] = c->ecx;
            cpu->hyperv_interface_id[3] = c->edx;
        }

        c = cpuid_find_entry(cpuid, HV_CPUID_VERSION, 0);
        if (c) {
            cpu->hyperv_version_id[0] = c->eax;
            cpu->hyperv_version_id[1] = c->ebx;
            cpu->hyperv_version_id[2] = c->ecx;
            cpu->hyperv_version_id[3] = c->edx;
        }

        c = cpuid_find_entry(cpuid, HV_CPUID_FEATURES, 0);
        if (c) {
            env->features[FEAT_HYPERV_EAX] = c->eax;
            env->features[FEAT_HYPERV_EBX] = c->ebx;
            env->features[FEAT_HYPERV_EDX] = c->edx;
        }

        c = cpuid_find_entry(cpuid, HV_CPUID_IMPLEMENT_LIMITS, 0);
        if (c) {
            cpu->hv_max_vps = c->eax;
            cpu->hyperv_limits[0] = c->ebx;
            cpu->hyperv_limits[1] = c->ecx;
            cpu->hyperv_limits[2] = c->edx;
        }

        c = cpuid_find_entry(cpuid, HV_CPUID_ENLIGHTMENT_INFO, 0);
        if (c) {
            env->features[FEAT_HV_RECOMM_EAX] = c->eax;

            /* hv-spinlocks may have been overridden */
            if (cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) {
                c->ebx = cpu->hyperv_spinlock_attempts;
            }
        }
        c = cpuid_find_entry(cpuid, HV_CPUID_NESTED_FEATURES, 0);
        if (c) {
            env->features[FEAT_HV_NESTED_EAX] = c->eax;
        }
    }

    if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_ON) {
        env->features[FEAT_HV_RECOMM_EAX] |= HV_NO_NONARCH_CORESHARING;
    } else if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO) {
        c = cpuid_find_entry(cpuid, HV_CPUID_ENLIGHTMENT_INFO, 0);
        if (c) {
            env->features[FEAT_HV_RECOMM_EAX] |=
                c->eax & HV_NO_NONARCH_CORESHARING;
        }
    }

    /* Features */
    r = hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_RELAXED);
    r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_VAPIC);
    r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_TIME);
    r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_CRASH);
    r |=
hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_RESET); 1280 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_VPINDEX); 1281 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_RUNTIME); 1282 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_SYNIC); 1283 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_STIMER); 1284 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_FREQUENCIES); 1285 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_REENLIGHTENMENT); 1286 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_TLBFLUSH); 1287 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_EVMCS); 1288 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_IPI); 1289 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_STIMER_DIRECT); 1290 1291 /* Additional dependencies not covered by kvm_hyperv_properties[] */ 1292 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) && 1293 !cpu->hyperv_synic_kvm_only && 1294 !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) { 1295 fprintf(stderr, "Hyper-V %s requires Hyper-V %s\n", 1296 kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc, 1297 kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc); 1298 r |= 1; 1299 } 1300 1301 /* Not exposed by KVM but needed to make CPU hotplug in Windows work */ 1302 env->features[FEAT_HYPERV_EDX] |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE; 1303 1304 if (r) { 1305 r = -ENOSYS; 1306 goto free; 1307 } 1308 1309 if (cpu->hyperv_passthrough) { 1310 /* We already copied all feature words from KVM as is */ 1311 r = cpuid->nent; 1312 goto free; 1313 } 1314 1315 c = &cpuid_ent[cpuid_i++]; 1316 c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS; 1317 c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ? 1318 HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS; 1319 c->ebx = cpu->hyperv_vendor_id[0]; 1320 c->ecx = cpu->hyperv_vendor_id[1]; 1321 c->edx = cpu->hyperv_vendor_id[2]; 1322 1323 c = &cpuid_ent[cpuid_i++]; 1324 c->function = HV_CPUID_INTERFACE; 1325 c->eax = cpu->hyperv_interface_id[0]; 1326 c->ebx = cpu->hyperv_interface_id[1]; 1327 c->ecx = cpu->hyperv_interface_id[2]; 1328 c->edx = cpu->hyperv_interface_id[3]; 1329 1330 c = &cpuid_ent[cpuid_i++]; 1331 c->function = HV_CPUID_VERSION; 1332 c->eax = cpu->hyperv_version_id[0]; 1333 c->ebx = cpu->hyperv_version_id[1]; 1334 c->ecx = cpu->hyperv_version_id[2]; 1335 c->edx = cpu->hyperv_version_id[3]; 1336 1337 c = &cpuid_ent[cpuid_i++]; 1338 c->function = HV_CPUID_FEATURES; 1339 c->eax = env->features[FEAT_HYPERV_EAX]; 1340 c->ebx = env->features[FEAT_HYPERV_EBX]; 1341 c->edx = env->features[FEAT_HYPERV_EDX]; 1342 1343 c = &cpuid_ent[cpuid_i++]; 1344 c->function = HV_CPUID_ENLIGHTMENT_INFO; 1345 c->eax = env->features[FEAT_HV_RECOMM_EAX]; 1346 c->ebx = cpu->hyperv_spinlock_attempts; 1347 1348 c = &cpuid_ent[cpuid_i++]; 1349 c->function = HV_CPUID_IMPLEMENT_LIMITS; 1350 c->eax = cpu->hv_max_vps; 1351 c->ebx = cpu->hyperv_limits[0]; 1352 c->ecx = cpu->hyperv_limits[1]; 1353 c->edx = cpu->hyperv_limits[2]; 1354 1355 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) { 1356 __u32 function; 1357 1358 /* Create zeroed 0x40000006..0x40000009 leaves */ 1359 for (function = HV_CPUID_IMPLEMENT_LIMITS + 1; 1360 function < HV_CPUID_NESTED_FEATURES; function++) { 1361 c = &cpuid_ent[cpuid_i++]; 1362 c->function = function; 1363 } 1364 1365 c = &cpuid_ent[cpuid_i++]; 1366 c->function = HV_CPUID_NESTED_FEATURES; 1367 c->eax = env->features[FEAT_HV_NESTED_EAX]; 1368 } 1369 r = cpuid_i; 1370 1371 free: 1372 g_free(cpuid); 1373 1374 return r; 1375 } 1376 1377 static Error *hv_passthrough_mig_blocker; 1378 static Error 
*hv_no_nonarch_cs_mig_blocker;

static int hyperv_init_vcpu(X86CPU *cpu)
{
    CPUState *cs = CPU(cpu);
    Error *local_err = NULL;
    int ret;

    if (cpu->hyperv_passthrough && hv_passthrough_mig_blocker == NULL) {
        error_setg(&hv_passthrough_mig_blocker,
                   "'hv-passthrough' CPU flag prevents migration, use explicit"
                   " set of hv-* flags instead");
        ret = migrate_add_blocker(hv_passthrough_mig_blocker, &local_err);
        if (local_err) {
            error_report_err(local_err);
            error_free(hv_passthrough_mig_blocker);
            return ret;
        }
    }

    if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO &&
        hv_no_nonarch_cs_mig_blocker == NULL) {
        error_setg(&hv_no_nonarch_cs_mig_blocker,
                   "'hv-no-nonarch-coresharing=auto' CPU flag prevents migration,"
                   " use explicit 'hv-no-nonarch-coresharing=on' instead (but"
                   " make sure SMT is disabled and/or that vCPUs are properly"
                   " pinned)");
        ret = migrate_add_blocker(hv_no_nonarch_cs_mig_blocker, &local_err);
        if (local_err) {
            error_report_err(local_err);
            error_free(hv_no_nonarch_cs_mig_blocker);
            return ret;
        }
    }

    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && !hv_vpindex_settable) {
        /*
         * the kernel doesn't support setting vp_index; assert that its value
         * is in sync
         */
        struct {
            struct kvm_msrs info;
            struct kvm_msr_entry entries[1];
        } msr_data = {
            .info.nmsrs = 1,
            .entries[0].index = HV_X64_MSR_VP_INDEX,
        };

        ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data);
        if (ret < 0) {
            return ret;
        }
        assert(ret == 1);

        if (msr_data.entries[0].data != hyperv_vp_index(CPU(cpu))) {
            error_report("kernel's vp_index != QEMU's vp_index");
            return -ENXIO;
        }
    }

    if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
        uint32_t synic_cap = cpu->hyperv_synic_kvm_only ?
            KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
        ret = kvm_vcpu_enable_cap(cs, synic_cap, 0);
        if (ret < 0) {
            error_report("failed to turn on HyperV SynIC in KVM: %s",
                         strerror(-ret));
            return ret;
        }

        if (!cpu->hyperv_synic_kvm_only) {
            ret = hyperv_x86_synic_add(cpu);
            if (ret < 0) {
                error_report("failed to create HyperV SynIC: %s",
                             strerror(-ret));
                return ret;
            }
        }
    }

    return 0;
}

static Error *invtsc_mig_blocker;

#define KVM_MAX_CPUID_ENTRIES  100

int kvm_arch_init_vcpu(CPUState *cs)
{
    struct {
        struct kvm_cpuid2 cpuid;
        struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
    } cpuid_data;
    /*
     * The kernel defines these structs with padding fields so there
     * should be no extra padding in our cpuid_data struct.
1474 */ 1475 QEMU_BUILD_BUG_ON(sizeof(cpuid_data) != 1476 sizeof(struct kvm_cpuid2) + 1477 sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES); 1478 1479 X86CPU *cpu = X86_CPU(cs); 1480 CPUX86State *env = &cpu->env; 1481 uint32_t limit, i, j, cpuid_i; 1482 uint32_t unused; 1483 struct kvm_cpuid_entry2 *c; 1484 uint32_t signature[3]; 1485 int kvm_base = KVM_CPUID_SIGNATURE; 1486 int max_nested_state_len; 1487 int r; 1488 Error *local_err = NULL; 1489 1490 memset(&cpuid_data, 0, sizeof(cpuid_data)); 1491 1492 cpuid_i = 0; 1493 1494 r = kvm_arch_set_tsc_khz(cs); 1495 if (r < 0) { 1496 return r; 1497 } 1498 1499 /* vcpu's TSC frequency is either specified by user, or following 1500 * the value used by KVM if the former is not present. In the 1501 * latter case, we query it from KVM and record in env->tsc_khz, 1502 * so that vcpu's TSC frequency can be migrated later via this field. 1503 */ 1504 if (!env->tsc_khz) { 1505 r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ? 1506 kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : 1507 -ENOTSUP; 1508 if (r > 0) { 1509 env->tsc_khz = r; 1510 } 1511 } 1512 1513 env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY; 1514 1515 /* Paravirtualization CPUIDs */ 1516 r = hyperv_handle_properties(cs, cpuid_data.entries); 1517 if (r < 0) { 1518 return r; 1519 } else if (r > 0) { 1520 cpuid_i = r; 1521 kvm_base = KVM_CPUID_SIGNATURE_NEXT; 1522 has_msr_hv_hypercall = true; 1523 } 1524 1525 if (cpu->expose_kvm) { 1526 memcpy(signature, "KVMKVMKVM\0\0\0", 12); 1527 c = &cpuid_data.entries[cpuid_i++]; 1528 c->function = KVM_CPUID_SIGNATURE | kvm_base; 1529 c->eax = KVM_CPUID_FEATURES | kvm_base; 1530 c->ebx = signature[0]; 1531 c->ecx = signature[1]; 1532 c->edx = signature[2]; 1533 1534 c = &cpuid_data.entries[cpuid_i++]; 1535 c->function = KVM_CPUID_FEATURES | kvm_base; 1536 c->eax = env->features[FEAT_KVM]; 1537 c->edx = env->features[FEAT_KVM_HINTS]; 1538 } 1539 1540 cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused); 1541 1542 for (i = 0; i <= limit; i++) { 1543 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 1544 fprintf(stderr, "unsupported level value: 0x%x\n", limit); 1545 abort(); 1546 } 1547 c = &cpuid_data.entries[cpuid_i++]; 1548 1549 switch (i) { 1550 case 2: { 1551 /* Keep reading function 2 till all the input is received */ 1552 int times; 1553 1554 c->function = i; 1555 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC | 1556 KVM_CPUID_FLAG_STATE_READ_NEXT; 1557 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); 1558 times = c->eax & 0xff; 1559 1560 for (j = 1; j < times; ++j) { 1561 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 1562 fprintf(stderr, "cpuid_data is full, no space for " 1563 "cpuid(eax:2):eax & 0xf = 0x%x\n", times); 1564 abort(); 1565 } 1566 c = &cpuid_data.entries[cpuid_i++]; 1567 c->function = i; 1568 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC; 1569 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); 1570 } 1571 break; 1572 } 1573 case 0x1f: 1574 if (env->nr_dies < 2) { 1575 break; 1576 } 1577 /* fallthrough */ 1578 case 4: 1579 case 0xb: 1580 case 0xd: 1581 for (j = 0; ; j++) { 1582 if (i == 0xd && j == 64) { 1583 break; 1584 } 1585 1586 if (i == 0x1f && j == 64) { 1587 break; 1588 } 1589 1590 c->function = i; 1591 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1592 c->index = j; 1593 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx); 1594 1595 if (i == 4 && c->eax == 0) { 1596 break; 1597 } 1598 if (i == 0xb && !(c->ecx & 0xff00)) { 1599 break; 1600 } 1601 if (i == 0x1f && !(c->ecx & 0xff00)) { 1602 break; 1603 } 1604 if (i 
== 0xd && c->eax == 0) {
                    continue;
                }
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    fprintf(stderr, "cpuid_data is full, no space for "
                            "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
                    abort();
                }
                c = &cpuid_data.entries[cpuid_i++];
            }
            break;
        case 0x7:
        case 0x14: {
            uint32_t times;

            c->function = i;
            c->index = 0;
            c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            times = c->eax;

            for (j = 1; j <= times; ++j) {
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    fprintf(stderr, "cpuid_data is full, no space for "
                            "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
                    abort();
                }
                c = &cpuid_data.entries[cpuid_i++];
                c->function = i;
                c->index = j;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
            }
            break;
        }
        default:
            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
                /*
                 * KVM already returns all zeroes if a CPUID entry is missing,
                 * so we can omit it and avoid hitting KVM's 80-entry limit.
                 */
                cpuid_i--;
            }
            break;
        }
    }

    if (limit >= 0x0a) {
        uint32_t eax, edx;

        cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx);

        has_architectural_pmu_version = eax & 0xff;
        if (has_architectural_pmu_version > 0) {
            num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8;

            /* Shouldn't be more than 32, since that's the number of bits
             * available in EBX to tell us _which_ counters are available.
             * Play it safe.
             */
            if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) {
                num_architectural_pmu_gp_counters = MAX_GP_COUNTERS;
            }

            if (has_architectural_pmu_version > 1) {
                num_architectural_pmu_fixed_counters = edx & 0x1f;

                if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) {
                    num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS;
                }
            }
        }
    }

    cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);

    for (i = 0x80000000; i <= limit; i++) {
        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
            fprintf(stderr, "unsupported xlevel value: 0x%x\n", limit);
            abort();
        }
        c = &cpuid_data.entries[cpuid_i++];

        switch (i) {
        case 0x8000001d:
            /* Query for all AMD cache information leaves */
            for (j = 0; ; j++) {
                c->function = i;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                c->index = j;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);

                if (c->eax == 0) {
                    break;
                }
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    fprintf(stderr, "cpuid_data is full, no space for "
                            "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
                    abort();
                }
                c = &cpuid_data.entries[cpuid_i++];
            }
            break;
        default:
            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
                /*
                 * KVM already returns all zeroes if a CPUID entry is missing,
                 * so we can omit it and avoid hitting KVM's 80-entry limit.
                 */
                cpuid_i--;
            }
            break;
        }
    }

    /* Call Centaur's CPUID instructions if they are supported.
*/ 1726 if (env->cpuid_xlevel2 > 0) { 1727 cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused); 1728 1729 for (i = 0xC0000000; i <= limit; i++) { 1730 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) { 1731 fprintf(stderr, "unsupported xlevel2 value: 0x%x\n", limit); 1732 abort(); 1733 } 1734 c = &cpuid_data.entries[cpuid_i++]; 1735 1736 c->function = i; 1737 c->flags = 0; 1738 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx); 1739 } 1740 } 1741 1742 cpuid_data.cpuid.nent = cpuid_i; 1743 1744 if (((env->cpuid_version >> 8)&0xF) >= 6 1745 && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) == 1746 (CPUID_MCE | CPUID_MCA) 1747 && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) { 1748 uint64_t mcg_cap, unsupported_caps; 1749 int banks; 1750 int ret; 1751 1752 ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks); 1753 if (ret < 0) { 1754 fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret)); 1755 return ret; 1756 } 1757 1758 if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) { 1759 error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)", 1760 (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks); 1761 return -ENOTSUP; 1762 } 1763 1764 unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK); 1765 if (unsupported_caps) { 1766 if (unsupported_caps & MCG_LMCE_P) { 1767 error_report("kvm: LMCE not supported"); 1768 return -ENOTSUP; 1769 } 1770 warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64, 1771 unsupported_caps); 1772 } 1773 1774 env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK; 1775 ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap); 1776 if (ret < 0) { 1777 fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret)); 1778 return ret; 1779 } 1780 } 1781 1782 cpu->vmsentry = qemu_add_vm_change_state_handler(cpu_update_state, env); 1783 1784 c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0); 1785 if (c) { 1786 has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) || 1787 !!(c->ecx & CPUID_EXT_SMX); 1788 } 1789 1790 if (env->mcg_cap & MCG_LMCE_P) { 1791 has_msr_mcg_ext_ctl = has_msr_feature_control = true; 1792 } 1793 1794 if (!env->user_tsc_khz) { 1795 if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) && 1796 invtsc_mig_blocker == NULL) { 1797 error_setg(&invtsc_mig_blocker, 1798 "State blocked by non-migratable CPU device" 1799 " (invtsc flag)"); 1800 r = migrate_add_blocker(invtsc_mig_blocker, &local_err); 1801 if (local_err) { 1802 error_report_err(local_err); 1803 error_free(invtsc_mig_blocker); 1804 return r; 1805 } 1806 } 1807 } 1808 1809 if (cpu->vmware_cpuid_freq 1810 /* Guests depend on 0x40000000 to detect this feature, so only expose 1811 * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */ 1812 && cpu->expose_kvm 1813 && kvm_base == KVM_CPUID_SIGNATURE 1814 /* TSC clock must be stable and known for this feature. 
*/ 1815 && tsc_is_stable_and_known(env)) { 1816 1817 c = &cpuid_data.entries[cpuid_i++]; 1818 c->function = KVM_CPUID_SIGNATURE | 0x10; 1819 c->eax = env->tsc_khz; 1820 c->ebx = env->apic_bus_freq / 1000; /* Hz to KHz */ 1821 c->ecx = c->edx = 0; 1822 1823 c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0); 1824 c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10); 1825 } 1826 1827 cpuid_data.cpuid.nent = cpuid_i; 1828 1829 cpuid_data.cpuid.padding = 0; 1830 r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data); 1831 if (r) { 1832 goto fail; 1833 } 1834 1835 if (has_xsave) { 1836 env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave)); 1837 memset(env->xsave_buf, 0, sizeof(struct kvm_xsave)); 1838 } 1839 1840 max_nested_state_len = kvm_max_nested_state_length(); 1841 if (max_nested_state_len > 0) { 1842 assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data)); 1843 1844 if (cpu_has_vmx(env) || cpu_has_svm(env)) { 1845 struct kvm_vmx_nested_state_hdr *vmx_hdr; 1846 1847 env->nested_state = g_malloc0(max_nested_state_len); 1848 env->nested_state->size = max_nested_state_len; 1849 1850 if (cpu_has_vmx(env)) { 1851 env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX; 1852 vmx_hdr = &env->nested_state->hdr.vmx; 1853 vmx_hdr->vmxon_pa = -1ull; 1854 vmx_hdr->vmcs12_pa = -1ull; 1855 } else { 1856 env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM; 1857 } 1858 } 1859 } 1860 1861 cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE); 1862 1863 if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) { 1864 has_msr_tsc_aux = false; 1865 } 1866 1867 kvm_init_msrs(cpu); 1868 1869 r = hyperv_init_vcpu(cpu); 1870 if (r) { 1871 goto fail; 1872 } 1873 1874 return 0; 1875 1876 fail: 1877 migrate_del_blocker(invtsc_mig_blocker); 1878 1879 return r; 1880 } 1881 1882 int kvm_arch_destroy_vcpu(CPUState *cs) 1883 { 1884 X86CPU *cpu = X86_CPU(cs); 1885 CPUX86State *env = &cpu->env; 1886 1887 if (cpu->kvm_msr_buf) { 1888 g_free(cpu->kvm_msr_buf); 1889 cpu->kvm_msr_buf = NULL; 1890 } 1891 1892 if (env->nested_state) { 1893 g_free(env->nested_state); 1894 env->nested_state = NULL; 1895 } 1896 1897 qemu_del_vm_change_state_handler(cpu->vmsentry); 1898 1899 return 0; 1900 } 1901 1902 void kvm_arch_reset_vcpu(X86CPU *cpu) 1903 { 1904 CPUX86State *env = &cpu->env; 1905 1906 env->xcr0 = 1; 1907 if (kvm_irqchip_in_kernel()) { 1908 env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE : 1909 KVM_MP_STATE_UNINITIALIZED; 1910 } else { 1911 env->mp_state = KVM_MP_STATE_RUNNABLE; 1912 } 1913 1914 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) { 1915 int i; 1916 for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) { 1917 env->msr_hv_synic_sint[i] = HV_SINT_MASKED; 1918 } 1919 1920 hyperv_x86_synic_reset(cpu); 1921 } 1922 /* enabled by default */ 1923 env->poll_control_msr = 1; 1924 } 1925 1926 void kvm_arch_do_init_vcpu(X86CPU *cpu) 1927 { 1928 CPUX86State *env = &cpu->env; 1929 1930 /* APs get directly into wait-for-SIPI state. 
*/ 1931 if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) { 1932 env->mp_state = KVM_MP_STATE_INIT_RECEIVED; 1933 } 1934 } 1935 1936 static int kvm_get_supported_feature_msrs(KVMState *s) 1937 { 1938 int ret = 0; 1939 1940 if (kvm_feature_msrs != NULL) { 1941 return 0; 1942 } 1943 1944 if (!kvm_check_extension(s, KVM_CAP_GET_MSR_FEATURES)) { 1945 return 0; 1946 } 1947 1948 struct kvm_msr_list msr_list; 1949 1950 msr_list.nmsrs = 0; 1951 ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, &msr_list); 1952 if (ret < 0 && ret != -E2BIG) { 1953 error_report("Fetch KVM feature MSR list failed: %s", 1954 strerror(-ret)); 1955 return ret; 1956 } 1957 1958 assert(msr_list.nmsrs > 0); 1959 kvm_feature_msrs = (struct kvm_msr_list *) \ 1960 g_malloc0(sizeof(msr_list) + 1961 msr_list.nmsrs * sizeof(msr_list.indices[0])); 1962 1963 kvm_feature_msrs->nmsrs = msr_list.nmsrs; 1964 ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, kvm_feature_msrs); 1965 1966 if (ret < 0) { 1967 error_report("Fetch KVM feature MSR list failed: %s", 1968 strerror(-ret)); 1969 g_free(kvm_feature_msrs); 1970 kvm_feature_msrs = NULL; 1971 return ret; 1972 } 1973 1974 return 0; 1975 } 1976 1977 static int kvm_get_supported_msrs(KVMState *s) 1978 { 1979 int ret = 0; 1980 struct kvm_msr_list msr_list, *kvm_msr_list; 1981 1982 /* 1983 * Obtain MSR list from KVM. These are the MSRs that we must 1984 * save/restore. 1985 */ 1986 msr_list.nmsrs = 0; 1987 ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list); 1988 if (ret < 0 && ret != -E2BIG) { 1989 return ret; 1990 } 1991 /* 1992 * Old kernel modules had a bug and could write beyond the provided 1993 * memory. Allocate at least a safe amount of 1K. 1994 */ 1995 kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) + 1996 msr_list.nmsrs * 1997 sizeof(msr_list.indices[0]))); 1998 1999 kvm_msr_list->nmsrs = msr_list.nmsrs; 2000 ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list); 2001 if (ret >= 0) { 2002 int i; 2003 2004 for (i = 0; i < kvm_msr_list->nmsrs; i++) { 2005 switch (kvm_msr_list->indices[i]) { 2006 case MSR_STAR: 2007 has_msr_star = true; 2008 break; 2009 case MSR_VM_HSAVE_PA: 2010 has_msr_hsave_pa = true; 2011 break; 2012 case MSR_TSC_AUX: 2013 has_msr_tsc_aux = true; 2014 break; 2015 case MSR_TSC_ADJUST: 2016 has_msr_tsc_adjust = true; 2017 break; 2018 case MSR_IA32_TSCDEADLINE: 2019 has_msr_tsc_deadline = true; 2020 break; 2021 case MSR_IA32_SMBASE: 2022 has_msr_smbase = true; 2023 break; 2024 case MSR_SMI_COUNT: 2025 has_msr_smi_count = true; 2026 break; 2027 case MSR_IA32_MISC_ENABLE: 2028 has_msr_misc_enable = true; 2029 break; 2030 case MSR_IA32_BNDCFGS: 2031 has_msr_bndcfgs = true; 2032 break; 2033 case MSR_IA32_XSS: 2034 has_msr_xss = true; 2035 break; 2036 case MSR_IA32_UMWAIT_CONTROL: 2037 has_msr_umwait = true; 2038 break; 2039 case HV_X64_MSR_CRASH_CTL: 2040 has_msr_hv_crash = true; 2041 break; 2042 case HV_X64_MSR_RESET: 2043 has_msr_hv_reset = true; 2044 break; 2045 case HV_X64_MSR_VP_INDEX: 2046 has_msr_hv_vpindex = true; 2047 break; 2048 case HV_X64_MSR_VP_RUNTIME: 2049 has_msr_hv_runtime = true; 2050 break; 2051 case HV_X64_MSR_SCONTROL: 2052 has_msr_hv_synic = true; 2053 break; 2054 case HV_X64_MSR_STIMER0_CONFIG: 2055 has_msr_hv_stimer = true; 2056 break; 2057 case HV_X64_MSR_TSC_FREQUENCY: 2058 has_msr_hv_frequencies = true; 2059 break; 2060 case HV_X64_MSR_REENLIGHTENMENT_CONTROL: 2061 has_msr_hv_reenlightenment = true; 2062 break; 2063 case MSR_IA32_SPEC_CTRL: 2064 has_msr_spec_ctrl = true; 2065 break; 2066 case MSR_IA32_TSX_CTRL: 2067 
has_msr_tsx_ctrl = true; 2068 break; 2069 case MSR_VIRT_SSBD: 2070 has_msr_virt_ssbd = true; 2071 break; 2072 case MSR_IA32_ARCH_CAPABILITIES: 2073 has_msr_arch_capabs = true; 2074 break; 2075 case MSR_IA32_CORE_CAPABILITY: 2076 has_msr_core_capabs = true; 2077 break; 2078 case MSR_IA32_PERF_CAPABILITIES: 2079 has_msr_perf_capabs = true; 2080 break; 2081 case MSR_IA32_VMX_VMFUNC: 2082 has_msr_vmx_vmfunc = true; 2083 break; 2084 case MSR_IA32_UCODE_REV: 2085 has_msr_ucode_rev = true; 2086 break; 2087 case MSR_IA32_VMX_PROCBASED_CTLS2: 2088 has_msr_vmx_procbased_ctls2 = true; 2089 break; 2090 } 2091 } 2092 } 2093 2094 g_free(kvm_msr_list); 2095 2096 return ret; 2097 } 2098 2099 static Notifier smram_machine_done; 2100 static KVMMemoryListener smram_listener; 2101 static AddressSpace smram_address_space; 2102 static MemoryRegion smram_as_root; 2103 static MemoryRegion smram_as_mem; 2104 2105 static void register_smram_listener(Notifier *n, void *unused) 2106 { 2107 MemoryRegion *smram = 2108 (MemoryRegion *) object_resolve_path("/machine/smram", NULL); 2109 2110 /* Outer container... */ 2111 memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull); 2112 memory_region_set_enabled(&smram_as_root, true); 2113 2114 /* ... with two regions inside: normal system memory with low 2115 * priority, and... 2116 */ 2117 memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram", 2118 get_system_memory(), 0, ~0ull); 2119 memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0); 2120 memory_region_set_enabled(&smram_as_mem, true); 2121 2122 if (smram) { 2123 /* ... SMRAM with higher priority */ 2124 memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10); 2125 memory_region_set_enabled(smram, true); 2126 } 2127 2128 address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM"); 2129 kvm_memory_listener_register(kvm_state, &smram_listener, 2130 &smram_address_space, 1); 2131 } 2132 2133 int kvm_arch_init(MachineState *ms, KVMState *s) 2134 { 2135 uint64_t identity_base = 0xfffbc000; 2136 uint64_t shadow_mem; 2137 int ret; 2138 struct utsname utsname; 2139 Error *local_err = NULL; 2140 2141 /* 2142 * Initialize SEV context, if required 2143 * 2144 * If no memory encryption is requested (ms->cgs == NULL) this is 2145 * a no-op. 2146 * 2147 * It's also a no-op if a non-SEV confidential guest support 2148 * mechanism is selected. SEV is the only mechanism available to 2149 * select on x86 at present, so this doesn't arise, but if new 2150 * mechanisms are supported in future (e.g. TDX), they'll need 2151 * their own initialization either here or elsewhere. 
2152 */ 2153 ret = sev_kvm_init(ms->cgs, &local_err); 2154 if (ret < 0) { 2155 error_report_err(local_err); 2156 return ret; 2157 } 2158 2159 if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) { 2160 error_report("kvm: KVM_CAP_IRQ_ROUTING not supported by KVM"); 2161 return -ENOTSUP; 2162 } 2163 2164 has_xsave = kvm_check_extension(s, KVM_CAP_XSAVE); 2165 has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS); 2166 has_pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2); 2167 2168 hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX); 2169 2170 has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD); 2171 if (has_exception_payload) { 2172 ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true); 2173 if (ret < 0) { 2174 error_report("kvm: Failed to enable exception payload cap: %s", 2175 strerror(-ret)); 2176 return ret; 2177 } 2178 } 2179 2180 ret = kvm_get_supported_msrs(s); 2181 if (ret < 0) { 2182 return ret; 2183 } 2184 2185 kvm_get_supported_feature_msrs(s); 2186 2187 uname(&utsname); 2188 lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0; 2189 2190 /* 2191 * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly. 2192 * In order to use vm86 mode, an EPT identity map and a TSS are needed. 2193 * Since these must be part of guest physical memory, we need to allocate 2194 * them, both by setting their start addresses in the kernel and by 2195 * creating a corresponding e820 entry. We need 4 pages before the BIOS. 2196 * 2197 * Older KVM versions may not support setting the identity map base. In 2198 * that case we need to stick with the default, i.e. a 256K maximum BIOS 2199 * size. 2200 */ 2201 if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) { 2202 /* Allows up to 16M BIOSes. */ 2203 identity_base = 0xfeffc000; 2204 2205 ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base); 2206 if (ret < 0) { 2207 return ret; 2208 } 2209 } 2210 2211 /* Set TSS base one page after EPT identity map. */ 2212 ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000); 2213 if (ret < 0) { 2214 return ret; 2215 } 2216 2217 /* Tell fw_cfg to notify the BIOS to reserve the range. */ 2218 ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED); 2219 if (ret < 0) { 2220 fprintf(stderr, "e820_add_entry() table is full\n"); 2221 return ret; 2222 } 2223 2224 shadow_mem = object_property_get_int(OBJECT(s), "kvm-shadow-mem", &error_abort); 2225 if (shadow_mem != -1) { 2226 shadow_mem /= 4096; 2227 ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem); 2228 if (ret < 0) { 2229 return ret; 2230 } 2231 } 2232 2233 if (kvm_check_extension(s, KVM_CAP_X86_SMM) && 2234 object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) && 2235 x86_machine_is_smm_enabled(X86_MACHINE(ms))) { 2236 smram_machine_done.notify = register_smram_listener; 2237 qemu_add_machine_init_done_notifier(&smram_machine_done); 2238 } 2239 2240 if (enable_cpu_pm) { 2241 int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS); 2242 int ret; 2243 2244 /* Work around for kernel header with a typo. TODO: fix header and drop. 
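         * (the kernel uapi header misspelled KVM_X86_DISABLE_EXITS_HLT as
         * KVM_X86_DISABLE_EXITS_HTL; the #if below maps the misspelled name
         * to the corrected one when only the old definition is available).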
*/ 2245 #if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT) 2246 #define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL 2247 #endif 2248 if (disable_exits) { 2249 disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT | 2250 KVM_X86_DISABLE_EXITS_HLT | 2251 KVM_X86_DISABLE_EXITS_PAUSE | 2252 KVM_X86_DISABLE_EXITS_CSTATE); 2253 } 2254 2255 ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0, 2256 disable_exits); 2257 if (ret < 0) { 2258 error_report("kvm: guest stopping CPU not supported: %s", 2259 strerror(-ret)); 2260 } 2261 } 2262 2263 return 0; 2264 } 2265 2266 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs) 2267 { 2268 lhs->selector = rhs->selector; 2269 lhs->base = rhs->base; 2270 lhs->limit = rhs->limit; 2271 lhs->type = 3; 2272 lhs->present = 1; 2273 lhs->dpl = 3; 2274 lhs->db = 0; 2275 lhs->s = 1; 2276 lhs->l = 0; 2277 lhs->g = 0; 2278 lhs->avl = 0; 2279 lhs->unusable = 0; 2280 } 2281 2282 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs) 2283 { 2284 unsigned flags = rhs->flags; 2285 lhs->selector = rhs->selector; 2286 lhs->base = rhs->base; 2287 lhs->limit = rhs->limit; 2288 lhs->type = (flags >> DESC_TYPE_SHIFT) & 15; 2289 lhs->present = (flags & DESC_P_MASK) != 0; 2290 lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3; 2291 lhs->db = (flags >> DESC_B_SHIFT) & 1; 2292 lhs->s = (flags & DESC_S_MASK) != 0; 2293 lhs->l = (flags >> DESC_L_SHIFT) & 1; 2294 lhs->g = (flags & DESC_G_MASK) != 0; 2295 lhs->avl = (flags & DESC_AVL_MASK) != 0; 2296 lhs->unusable = !lhs->present; 2297 lhs->padding = 0; 2298 } 2299 2300 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs) 2301 { 2302 lhs->selector = rhs->selector; 2303 lhs->base = rhs->base; 2304 lhs->limit = rhs->limit; 2305 lhs->flags = (rhs->type << DESC_TYPE_SHIFT) | 2306 ((rhs->present && !rhs->unusable) * DESC_P_MASK) | 2307 (rhs->dpl << DESC_DPL_SHIFT) | 2308 (rhs->db << DESC_B_SHIFT) | 2309 (rhs->s * DESC_S_MASK) | 2310 (rhs->l << DESC_L_SHIFT) | 2311 (rhs->g * DESC_G_MASK) | 2312 (rhs->avl * DESC_AVL_MASK); 2313 } 2314 2315 static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set) 2316 { 2317 if (set) { 2318 *kvm_reg = *qemu_reg; 2319 } else { 2320 *qemu_reg = *kvm_reg; 2321 } 2322 } 2323 2324 static int kvm_getput_regs(X86CPU *cpu, int set) 2325 { 2326 CPUX86State *env = &cpu->env; 2327 struct kvm_regs regs; 2328 int ret = 0; 2329 2330 if (!set) { 2331 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, ®s); 2332 if (ret < 0) { 2333 return ret; 2334 } 2335 } 2336 2337 kvm_getput_reg(®s.rax, &env->regs[R_EAX], set); 2338 kvm_getput_reg(®s.rbx, &env->regs[R_EBX], set); 2339 kvm_getput_reg(®s.rcx, &env->regs[R_ECX], set); 2340 kvm_getput_reg(®s.rdx, &env->regs[R_EDX], set); 2341 kvm_getput_reg(®s.rsi, &env->regs[R_ESI], set); 2342 kvm_getput_reg(®s.rdi, &env->regs[R_EDI], set); 2343 kvm_getput_reg(®s.rsp, &env->regs[R_ESP], set); 2344 kvm_getput_reg(®s.rbp, &env->regs[R_EBP], set); 2345 #ifdef TARGET_X86_64 2346 kvm_getput_reg(®s.r8, &env->regs[8], set); 2347 kvm_getput_reg(®s.r9, &env->regs[9], set); 2348 kvm_getput_reg(®s.r10, &env->regs[10], set); 2349 kvm_getput_reg(®s.r11, &env->regs[11], set); 2350 kvm_getput_reg(®s.r12, &env->regs[12], set); 2351 kvm_getput_reg(®s.r13, &env->regs[13], set); 2352 kvm_getput_reg(®s.r14, &env->regs[14], set); 2353 kvm_getput_reg(®s.r15, &env->regs[15], set); 2354 #endif 2355 2356 kvm_getput_reg(®s.rflags, &env->eflags, set); 2357 kvm_getput_reg(®s.rip, &env->eip, set); 2358 2359 if (set) { 2360 ret = 
kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, ®s); 2361 } 2362 2363 return ret; 2364 } 2365 2366 static int kvm_put_fpu(X86CPU *cpu) 2367 { 2368 CPUX86State *env = &cpu->env; 2369 struct kvm_fpu fpu; 2370 int i; 2371 2372 memset(&fpu, 0, sizeof fpu); 2373 fpu.fsw = env->fpus & ~(7 << 11); 2374 fpu.fsw |= (env->fpstt & 7) << 11; 2375 fpu.fcw = env->fpuc; 2376 fpu.last_opcode = env->fpop; 2377 fpu.last_ip = env->fpip; 2378 fpu.last_dp = env->fpdp; 2379 for (i = 0; i < 8; ++i) { 2380 fpu.ftwx |= (!env->fptags[i]) << i; 2381 } 2382 memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs); 2383 for (i = 0; i < CPU_NB_REGS; i++) { 2384 stq_p(&fpu.xmm[i][0], env->xmm_regs[i].ZMM_Q(0)); 2385 stq_p(&fpu.xmm[i][8], env->xmm_regs[i].ZMM_Q(1)); 2386 } 2387 fpu.mxcsr = env->mxcsr; 2388 2389 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu); 2390 } 2391 2392 #define XSAVE_FCW_FSW 0 2393 #define XSAVE_FTW_FOP 1 2394 #define XSAVE_CWD_RIP 2 2395 #define XSAVE_CWD_RDP 4 2396 #define XSAVE_MXCSR 6 2397 #define XSAVE_ST_SPACE 8 2398 #define XSAVE_XMM_SPACE 40 2399 #define XSAVE_XSTATE_BV 128 2400 #define XSAVE_YMMH_SPACE 144 2401 #define XSAVE_BNDREGS 240 2402 #define XSAVE_BNDCSR 256 2403 #define XSAVE_OPMASK 272 2404 #define XSAVE_ZMM_Hi256 288 2405 #define XSAVE_Hi16_ZMM 416 2406 #define XSAVE_PKRU 672 2407 2408 #define XSAVE_BYTE_OFFSET(word_offset) \ 2409 ((word_offset) * sizeof_field(struct kvm_xsave, region[0])) 2410 2411 #define ASSERT_OFFSET(word_offset, field) \ 2412 QEMU_BUILD_BUG_ON(XSAVE_BYTE_OFFSET(word_offset) != \ 2413 offsetof(X86XSaveArea, field)) 2414 2415 ASSERT_OFFSET(XSAVE_FCW_FSW, legacy.fcw); 2416 ASSERT_OFFSET(XSAVE_FTW_FOP, legacy.ftw); 2417 ASSERT_OFFSET(XSAVE_CWD_RIP, legacy.fpip); 2418 ASSERT_OFFSET(XSAVE_CWD_RDP, legacy.fpdp); 2419 ASSERT_OFFSET(XSAVE_MXCSR, legacy.mxcsr); 2420 ASSERT_OFFSET(XSAVE_ST_SPACE, legacy.fpregs); 2421 ASSERT_OFFSET(XSAVE_XMM_SPACE, legacy.xmm_regs); 2422 ASSERT_OFFSET(XSAVE_XSTATE_BV, header.xstate_bv); 2423 ASSERT_OFFSET(XSAVE_YMMH_SPACE, avx_state); 2424 ASSERT_OFFSET(XSAVE_BNDREGS, bndreg_state); 2425 ASSERT_OFFSET(XSAVE_BNDCSR, bndcsr_state); 2426 ASSERT_OFFSET(XSAVE_OPMASK, opmask_state); 2427 ASSERT_OFFSET(XSAVE_ZMM_Hi256, zmm_hi256_state); 2428 ASSERT_OFFSET(XSAVE_Hi16_ZMM, hi16_zmm_state); 2429 ASSERT_OFFSET(XSAVE_PKRU, pkru_state); 2430 2431 static int kvm_put_xsave(X86CPU *cpu) 2432 { 2433 CPUX86State *env = &cpu->env; 2434 X86XSaveArea *xsave = env->xsave_buf; 2435 2436 if (!has_xsave) { 2437 return kvm_put_fpu(cpu); 2438 } 2439 x86_cpu_xsave_all_areas(cpu, xsave); 2440 2441 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave); 2442 } 2443 2444 static int kvm_put_xcrs(X86CPU *cpu) 2445 { 2446 CPUX86State *env = &cpu->env; 2447 struct kvm_xcrs xcrs = {}; 2448 2449 if (!has_xcrs) { 2450 return 0; 2451 } 2452 2453 xcrs.nr_xcrs = 1; 2454 xcrs.flags = 0; 2455 xcrs.xcrs[0].xcr = 0; 2456 xcrs.xcrs[0].value = env->xcr0; 2457 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs); 2458 } 2459 2460 static int kvm_put_sregs(X86CPU *cpu) 2461 { 2462 CPUX86State *env = &cpu->env; 2463 struct kvm_sregs sregs; 2464 2465 memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap)); 2466 if (env->interrupt_injected >= 0) { 2467 sregs.interrupt_bitmap[env->interrupt_injected / 64] |= 2468 (uint64_t)1 << (env->interrupt_injected % 64); 2469 } 2470 2471 if ((env->eflags & VM_MASK)) { 2472 set_v8086_seg(&sregs.cs, &env->segs[R_CS]); 2473 set_v8086_seg(&sregs.ds, &env->segs[R_DS]); 2474 set_v8086_seg(&sregs.es, &env->segs[R_ES]); 2475 set_v8086_seg(&sregs.fs, 
&env->segs[R_FS]); 2476 set_v8086_seg(&sregs.gs, &env->segs[R_GS]); 2477 set_v8086_seg(&sregs.ss, &env->segs[R_SS]); 2478 } else { 2479 set_seg(&sregs.cs, &env->segs[R_CS]); 2480 set_seg(&sregs.ds, &env->segs[R_DS]); 2481 set_seg(&sregs.es, &env->segs[R_ES]); 2482 set_seg(&sregs.fs, &env->segs[R_FS]); 2483 set_seg(&sregs.gs, &env->segs[R_GS]); 2484 set_seg(&sregs.ss, &env->segs[R_SS]); 2485 } 2486 2487 set_seg(&sregs.tr, &env->tr); 2488 set_seg(&sregs.ldt, &env->ldt); 2489 2490 sregs.idt.limit = env->idt.limit; 2491 sregs.idt.base = env->idt.base; 2492 memset(sregs.idt.padding, 0, sizeof sregs.idt.padding); 2493 sregs.gdt.limit = env->gdt.limit; 2494 sregs.gdt.base = env->gdt.base; 2495 memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding); 2496 2497 sregs.cr0 = env->cr[0]; 2498 sregs.cr2 = env->cr[2]; 2499 sregs.cr3 = env->cr[3]; 2500 sregs.cr4 = env->cr[4]; 2501 2502 sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state); 2503 sregs.apic_base = cpu_get_apic_base(cpu->apic_state); 2504 2505 sregs.efer = env->efer; 2506 2507 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs); 2508 } 2509 2510 static void kvm_msr_buf_reset(X86CPU *cpu) 2511 { 2512 memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE); 2513 } 2514 2515 static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value) 2516 { 2517 struct kvm_msrs *msrs = cpu->kvm_msr_buf; 2518 void *limit = ((void *)msrs) + MSR_BUF_SIZE; 2519 struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs]; 2520 2521 assert((void *)(entry + 1) <= limit); 2522 2523 entry->index = index; 2524 entry->reserved = 0; 2525 entry->data = value; 2526 msrs->nmsrs++; 2527 } 2528 2529 static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value) 2530 { 2531 kvm_msr_buf_reset(cpu); 2532 kvm_msr_entry_add(cpu, index, value); 2533 2534 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf); 2535 } 2536 2537 void kvm_put_apicbase(X86CPU *cpu, uint64_t value) 2538 { 2539 int ret; 2540 2541 ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value); 2542 assert(ret == 1); 2543 } 2544 2545 static int kvm_put_tscdeadline_msr(X86CPU *cpu) 2546 { 2547 CPUX86State *env = &cpu->env; 2548 int ret; 2549 2550 if (!has_msr_tsc_deadline) { 2551 return 0; 2552 } 2553 2554 ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline); 2555 if (ret < 0) { 2556 return ret; 2557 } 2558 2559 assert(ret == 1); 2560 return 0; 2561 } 2562 2563 /* 2564 * Provide a separate write service for the feature control MSR in order to 2565 * kick the VCPU out of VMXON or even guest mode on reset. This has to be done 2566 * before writing any other state because forcibly leaving nested mode 2567 * invalidates the VCPU state. 
2568 */ 2569 static int kvm_put_msr_feature_control(X86CPU *cpu) 2570 { 2571 int ret; 2572 2573 if (!has_msr_feature_control) { 2574 return 0; 2575 } 2576 2577 ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL, 2578 cpu->env.msr_ia32_feature_control); 2579 if (ret < 0) { 2580 return ret; 2581 } 2582 2583 assert(ret == 1); 2584 return 0; 2585 } 2586 2587 static uint64_t make_vmx_msr_value(uint32_t index, uint32_t features) 2588 { 2589 uint32_t default1, can_be_one, can_be_zero; 2590 uint32_t must_be_one; 2591 2592 switch (index) { 2593 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 2594 default1 = 0x00000016; 2595 break; 2596 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 2597 default1 = 0x0401e172; 2598 break; 2599 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 2600 default1 = 0x000011ff; 2601 break; 2602 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 2603 default1 = 0x00036dff; 2604 break; 2605 case MSR_IA32_VMX_PROCBASED_CTLS2: 2606 default1 = 0; 2607 break; 2608 default: 2609 abort(); 2610 } 2611 2612 /* If a feature bit is set, the control can be either set or clear. 2613 * Otherwise the value is limited to either 0 or 1 by default1. 2614 */ 2615 can_be_one = features | default1; 2616 can_be_zero = features | ~default1; 2617 must_be_one = ~can_be_zero; 2618 2619 /* 2620 * Bit 0:31 -> 0 if the control bit can be zero (i.e. 1 if it must be one). 2621 * Bit 32:63 -> 1 if the control bit can be one. 2622 */ 2623 return must_be_one | (((uint64_t)can_be_one) << 32); 2624 } 2625 2626 #define VMCS12_MAX_FIELD_INDEX (0x17) 2627 2628 static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f) 2629 { 2630 uint64_t kvm_vmx_basic = 2631 kvm_arch_get_supported_msr_feature(kvm_state, 2632 MSR_IA32_VMX_BASIC); 2633 2634 if (!kvm_vmx_basic) { 2635 /* If the kernel doesn't support VMX feature (kvm_intel.nested=0), 2636 * then kvm_vmx_basic will be 0 and KVM_SET_MSR will fail. 2637 */ 2638 return; 2639 } 2640 2641 uint64_t kvm_vmx_misc = 2642 kvm_arch_get_supported_msr_feature(kvm_state, 2643 MSR_IA32_VMX_MISC); 2644 uint64_t kvm_vmx_ept_vpid = 2645 kvm_arch_get_supported_msr_feature(kvm_state, 2646 MSR_IA32_VMX_EPT_VPID_CAP); 2647 2648 /* 2649 * If the guest is 64-bit, a value of 1 is allowed for the host address 2650 * space size vmexit control. 2651 */ 2652 uint64_t fixed_vmx_exit = f[FEAT_8000_0001_EDX] & CPUID_EXT2_LM 2653 ? (uint64_t)VMX_VM_EXIT_HOST_ADDR_SPACE_SIZE << 32 : 0; 2654 2655 /* 2656 * Bits 0-30, 32-44 and 50-53 come from the host. KVM should 2657 * not change them for backwards compatibility. 2658 */ 2659 uint64_t fixed_vmx_basic = kvm_vmx_basic & 2660 (MSR_VMX_BASIC_VMCS_REVISION_MASK | 2661 MSR_VMX_BASIC_VMXON_REGION_SIZE_MASK | 2662 MSR_VMX_BASIC_VMCS_MEM_TYPE_MASK); 2663 2664 /* 2665 * Same for bits 0-4 and 25-27. Bits 16-24 (CR3 target count) can 2666 * change in the future but are always zero for now, clear them to be 2667 * future proof. Bits 32-63 in theory could change, though KVM does 2668 * not support dual-monitor treatment and probably never will; mask 2669 * them out as well. 2670 */ 2671 uint64_t fixed_vmx_misc = kvm_vmx_misc & 2672 (MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK | 2673 MSR_VMX_MISC_MAX_MSR_LIST_SIZE_MASK); 2674 2675 /* 2676 * EPT memory types should not change either, so we do not bother 2677 * adding features for them. 2678 */ 2679 uint64_t fixed_vmx_ept_mask = 2680 (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_ENABLE_EPT ? 
2681 MSR_VMX_EPT_UC | MSR_VMX_EPT_WB : 0); 2682 uint64_t fixed_vmx_ept_vpid = kvm_vmx_ept_vpid & fixed_vmx_ept_mask; 2683 2684 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 2685 make_vmx_msr_value(MSR_IA32_VMX_TRUE_PROCBASED_CTLS, 2686 f[FEAT_VMX_PROCBASED_CTLS])); 2687 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS, 2688 make_vmx_msr_value(MSR_IA32_VMX_TRUE_PINBASED_CTLS, 2689 f[FEAT_VMX_PINBASED_CTLS])); 2690 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_EXIT_CTLS, 2691 make_vmx_msr_value(MSR_IA32_VMX_TRUE_EXIT_CTLS, 2692 f[FEAT_VMX_EXIT_CTLS]) | fixed_vmx_exit); 2693 kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS, 2694 make_vmx_msr_value(MSR_IA32_VMX_TRUE_ENTRY_CTLS, 2695 f[FEAT_VMX_ENTRY_CTLS])); 2696 kvm_msr_entry_add(cpu, MSR_IA32_VMX_PROCBASED_CTLS2, 2697 make_vmx_msr_value(MSR_IA32_VMX_PROCBASED_CTLS2, 2698 f[FEAT_VMX_SECONDARY_CTLS])); 2699 kvm_msr_entry_add(cpu, MSR_IA32_VMX_EPT_VPID_CAP, 2700 f[FEAT_VMX_EPT_VPID_CAPS] | fixed_vmx_ept_vpid); 2701 kvm_msr_entry_add(cpu, MSR_IA32_VMX_BASIC, 2702 f[FEAT_VMX_BASIC] | fixed_vmx_basic); 2703 kvm_msr_entry_add(cpu, MSR_IA32_VMX_MISC, 2704 f[FEAT_VMX_MISC] | fixed_vmx_misc); 2705 if (has_msr_vmx_vmfunc) { 2706 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMFUNC, f[FEAT_VMX_VMFUNC]); 2707 } 2708 2709 /* 2710 * Just to be safe, write these with constant values. The CRn_FIXED1 2711 * MSRs are generated by KVM based on the vCPU's CPUID. 2712 */ 2713 kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR0_FIXED0, 2714 CR0_PE_MASK | CR0_PG_MASK | CR0_NE_MASK); 2715 kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0, 2716 CR4_VMXE_MASK); 2717 kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 2718 VMCS12_MAX_FIELD_INDEX << 1); 2719 } 2720 2721 static void kvm_msr_entry_add_perf(X86CPU *cpu, FeatureWordArray f) 2722 { 2723 uint64_t kvm_perf_cap = 2724 kvm_arch_get_supported_msr_feature(kvm_state, 2725 MSR_IA32_PERF_CAPABILITIES); 2726 2727 if (kvm_perf_cap) { 2728 kvm_msr_entry_add(cpu, MSR_IA32_PERF_CAPABILITIES, 2729 kvm_perf_cap & f[FEAT_PERF_CAPABILITIES]); 2730 } 2731 } 2732 2733 static int kvm_buf_set_msrs(X86CPU *cpu) 2734 { 2735 int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf); 2736 if (ret < 0) { 2737 return ret; 2738 } 2739 2740 if (ret < cpu->kvm_msr_buf->nmsrs) { 2741 struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret]; 2742 error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64, 2743 (uint32_t)e->index, (uint64_t)e->data); 2744 } 2745 2746 assert(ret == cpu->kvm_msr_buf->nmsrs); 2747 return 0; 2748 } 2749 2750 static void kvm_init_msrs(X86CPU *cpu) 2751 { 2752 CPUX86State *env = &cpu->env; 2753 2754 kvm_msr_buf_reset(cpu); 2755 if (has_msr_arch_capabs) { 2756 kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES, 2757 env->features[FEAT_ARCH_CAPABILITIES]); 2758 } 2759 2760 if (has_msr_core_capabs) { 2761 kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY, 2762 env->features[FEAT_CORE_CAPABILITY]); 2763 } 2764 2765 if (has_msr_perf_capabs && cpu->enable_pmu) { 2766 kvm_msr_entry_add_perf(cpu, env->features); 2767 } 2768 2769 if (has_msr_ucode_rev) { 2770 kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev); 2771 } 2772 2773 /* 2774 * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but 2775 * all kernels with MSR features should have them. 
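     * (that is why the guard below checks kvm_feature_msrs rather than one of
     * the has_msr_* flags derived from KVM_GET_MSR_INDEX_LIST).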
2776 */ 2777 if (kvm_feature_msrs && cpu_has_vmx(env)) { 2778 kvm_msr_entry_add_vmx(cpu, env->features); 2779 } 2780 2781 assert(kvm_buf_set_msrs(cpu) == 0); 2782 } 2783 2784 static int kvm_put_msrs(X86CPU *cpu, int level) 2785 { 2786 CPUX86State *env = &cpu->env; 2787 int i; 2788 2789 kvm_msr_buf_reset(cpu); 2790 2791 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs); 2792 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp); 2793 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip); 2794 kvm_msr_entry_add(cpu, MSR_PAT, env->pat); 2795 if (has_msr_star) { 2796 kvm_msr_entry_add(cpu, MSR_STAR, env->star); 2797 } 2798 if (has_msr_hsave_pa) { 2799 kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave); 2800 } 2801 if (has_msr_tsc_aux) { 2802 kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux); 2803 } 2804 if (has_msr_tsc_adjust) { 2805 kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust); 2806 } 2807 if (has_msr_misc_enable) { 2808 kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 2809 env->msr_ia32_misc_enable); 2810 } 2811 if (has_msr_smbase) { 2812 kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase); 2813 } 2814 if (has_msr_smi_count) { 2815 kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count); 2816 } 2817 if (has_msr_bndcfgs) { 2818 kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs); 2819 } 2820 if (has_msr_xss) { 2821 kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss); 2822 } 2823 if (has_msr_umwait) { 2824 kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, env->umwait); 2825 } 2826 if (has_msr_spec_ctrl) { 2827 kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl); 2828 } 2829 if (has_msr_tsx_ctrl) { 2830 kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, env->tsx_ctrl); 2831 } 2832 if (has_msr_virt_ssbd) { 2833 kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd); 2834 } 2835 2836 #ifdef TARGET_X86_64 2837 if (lm_capable_kernel) { 2838 kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar); 2839 kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase); 2840 kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask); 2841 kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar); 2842 } 2843 #endif 2844 2845 /* 2846 * The following MSRs have side effects on the guest or are too heavy 2847 * for normal writeback. Limit them to reset or full state updates. 2848 */ 2849 if (level >= KVM_PUT_RESET_STATE) { 2850 kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc); 2851 kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr); 2852 kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr); 2853 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) { 2854 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr); 2855 } 2856 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) { 2857 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr); 2858 } 2859 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) { 2860 kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr); 2861 } 2862 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) { 2863 kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr); 2864 } 2865 2866 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) { 2867 kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr); 2868 } 2869 2870 if (has_architectural_pmu_version > 0) { 2871 if (has_architectural_pmu_version > 1) { 2872 /* Stop the counter. 
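                 * Writing 0 to FIXED_CTR_CTRL and GLOBAL_CTRL first ensures
                 * that no counter can tick while the saved counter values are
                 * loaded below; the saved control values are written back
                 * last to start the PMU again.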
*/ 2873 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0); 2874 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0); 2875 } 2876 2877 /* Set the counter values. */ 2878 for (i = 0; i < num_architectural_pmu_fixed_counters; i++) { 2879 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 2880 env->msr_fixed_counters[i]); 2881 } 2882 for (i = 0; i < num_architectural_pmu_gp_counters; i++) { 2883 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 2884 env->msr_gp_counters[i]); 2885 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 2886 env->msr_gp_evtsel[i]); 2887 } 2888 if (has_architectural_pmu_version > 1) { 2889 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 2890 env->msr_global_status); 2891 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 2892 env->msr_global_ovf_ctrl); 2893 2894 /* Now start the PMU. */ 2895 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 2896 env->msr_fixed_ctr_ctrl); 2897 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 2898 env->msr_global_ctrl); 2899 } 2900 } 2901 /* 2902 * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add, 2903 * only sync them to KVM on the first cpu 2904 */ 2905 if (current_cpu == first_cpu) { 2906 if (has_msr_hv_hypercall) { 2907 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 2908 env->msr_hv_guest_os_id); 2909 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 2910 env->msr_hv_hypercall); 2911 } 2912 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) { 2913 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 2914 env->msr_hv_tsc); 2915 } 2916 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) { 2917 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 2918 env->msr_hv_reenlightenment_control); 2919 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 2920 env->msr_hv_tsc_emulation_control); 2921 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 2922 env->msr_hv_tsc_emulation_status); 2923 } 2924 } 2925 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) { 2926 kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 2927 env->msr_hv_vapic); 2928 } 2929 if (has_msr_hv_crash) { 2930 int j; 2931 2932 for (j = 0; j < HV_CRASH_PARAMS; j++) 2933 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 2934 env->msr_hv_crash_params[j]); 2935 2936 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY); 2937 } 2938 if (has_msr_hv_runtime) { 2939 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime); 2940 } 2941 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) 2942 && hv_vpindex_settable) { 2943 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX, 2944 hyperv_vp_index(CPU(cpu))); 2945 } 2946 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) { 2947 int j; 2948 2949 kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION); 2950 2951 kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 2952 env->msr_hv_synic_control); 2953 kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 2954 env->msr_hv_synic_evt_page); 2955 kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 2956 env->msr_hv_synic_msg_page); 2957 2958 for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) { 2959 kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j, 2960 env->msr_hv_synic_sint[j]); 2961 } 2962 } 2963 if (has_msr_hv_stimer) { 2964 int j; 2965 2966 for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) { 2967 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2, 2968 env->msr_hv_stimer_config[j]); 2969 } 2970 2971 for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) { 2972 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2, 2973 env->msr_hv_stimer_count[j]); 2974 } 
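            /*
             * The synthetic timer MSRs are interleaved per timer
             * (STIMER0_CONFIG, STIMER0_COUNT, STIMER1_CONFIG, ...), which is
             * why both loops above index them with a stride of 2.
             */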
        }
        if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
            uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits);

            kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype);
            kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]);
            kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]);
            kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]);
            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]);
            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]);
            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]);
            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]);
            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]);
            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]);
            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]);
            kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]);
            for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
                /* The CPU GPs if we write to a bit above the physical limit of
                 * the host CPU (and KVM emulates that)
                 */
                uint64_t mask = env->mtrr_var[i].mask;
                mask &= phys_mask;

                kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i),
                                  env->mtrr_var[i].base);
                kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask);
            }
        }
        if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
            int addr_num = kvm_arch_get_supported_cpuid(kvm_state,
                                                        0x14, 1, R_EAX) & 0x7;

            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL,
                              env->msr_rtit_ctrl);
            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS,
                              env->msr_rtit_status);
            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE,
                              env->msr_rtit_output_base);
            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK,
                              env->msr_rtit_output_mask);
            kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH,
                              env->msr_rtit_cr3_match);
            for (i = 0; i < addr_num; i++) {
                kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i,
                                  env->msr_rtit_addrs[i]);
            }
        }

        /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
         * kvm_put_msr_feature_control.
*/ 3025 } 3026 3027 if (env->mcg_cap) { 3028 int i; 3029 3030 kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status); 3031 kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl); 3032 if (has_msr_mcg_ext_ctl) { 3033 kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl); 3034 } 3035 for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) { 3036 kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]); 3037 } 3038 } 3039 3040 return kvm_buf_set_msrs(cpu); 3041 } 3042 3043 3044 static int kvm_get_fpu(X86CPU *cpu) 3045 { 3046 CPUX86State *env = &cpu->env; 3047 struct kvm_fpu fpu; 3048 int i, ret; 3049 3050 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_FPU, &fpu); 3051 if (ret < 0) { 3052 return ret; 3053 } 3054 3055 env->fpstt = (fpu.fsw >> 11) & 7; 3056 env->fpus = fpu.fsw; 3057 env->fpuc = fpu.fcw; 3058 env->fpop = fpu.last_opcode; 3059 env->fpip = fpu.last_ip; 3060 env->fpdp = fpu.last_dp; 3061 for (i = 0; i < 8; ++i) { 3062 env->fptags[i] = !((fpu.ftwx >> i) & 1); 3063 } 3064 memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs); 3065 for (i = 0; i < CPU_NB_REGS; i++) { 3066 env->xmm_regs[i].ZMM_Q(0) = ldq_p(&fpu.xmm[i][0]); 3067 env->xmm_regs[i].ZMM_Q(1) = ldq_p(&fpu.xmm[i][8]); 3068 } 3069 env->mxcsr = fpu.mxcsr; 3070 3071 return 0; 3072 } 3073 3074 static int kvm_get_xsave(X86CPU *cpu) 3075 { 3076 CPUX86State *env = &cpu->env; 3077 X86XSaveArea *xsave = env->xsave_buf; 3078 int ret; 3079 3080 if (!has_xsave) { 3081 return kvm_get_fpu(cpu); 3082 } 3083 3084 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XSAVE, xsave); 3085 if (ret < 0) { 3086 return ret; 3087 } 3088 x86_cpu_xrstor_all_areas(cpu, xsave); 3089 3090 return 0; 3091 } 3092 3093 static int kvm_get_xcrs(X86CPU *cpu) 3094 { 3095 CPUX86State *env = &cpu->env; 3096 int i, ret; 3097 struct kvm_xcrs xcrs; 3098 3099 if (!has_xcrs) { 3100 return 0; 3101 } 3102 3103 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs); 3104 if (ret < 0) { 3105 return ret; 3106 } 3107 3108 for (i = 0; i < xcrs.nr_xcrs; i++) { 3109 /* Only support xcr0 now */ 3110 if (xcrs.xcrs[i].xcr == 0) { 3111 env->xcr0 = xcrs.xcrs[i].value; 3112 break; 3113 } 3114 } 3115 return 0; 3116 } 3117 3118 static int kvm_get_sregs(X86CPU *cpu) 3119 { 3120 CPUX86State *env = &cpu->env; 3121 struct kvm_sregs sregs; 3122 int bit, i, ret; 3123 3124 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs); 3125 if (ret < 0) { 3126 return ret; 3127 } 3128 3129 /* There can only be one pending IRQ set in the bitmap at a time, so try 3130 to find it and save its number instead (-1 for none). 
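       For example, a pending vector 0x61 shows up as bit 33 of
       interrupt_bitmap[1] and is recorded here as 64 * 1 + 33 = 97.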
*/ 3131 env->interrupt_injected = -1; 3132 for (i = 0; i < ARRAY_SIZE(sregs.interrupt_bitmap); i++) { 3133 if (sregs.interrupt_bitmap[i]) { 3134 bit = ctz64(sregs.interrupt_bitmap[i]); 3135 env->interrupt_injected = i * 64 + bit; 3136 break; 3137 } 3138 } 3139 3140 get_seg(&env->segs[R_CS], &sregs.cs); 3141 get_seg(&env->segs[R_DS], &sregs.ds); 3142 get_seg(&env->segs[R_ES], &sregs.es); 3143 get_seg(&env->segs[R_FS], &sregs.fs); 3144 get_seg(&env->segs[R_GS], &sregs.gs); 3145 get_seg(&env->segs[R_SS], &sregs.ss); 3146 3147 get_seg(&env->tr, &sregs.tr); 3148 get_seg(&env->ldt, &sregs.ldt); 3149 3150 env->idt.limit = sregs.idt.limit; 3151 env->idt.base = sregs.idt.base; 3152 env->gdt.limit = sregs.gdt.limit; 3153 env->gdt.base = sregs.gdt.base; 3154 3155 env->cr[0] = sregs.cr0; 3156 env->cr[2] = sregs.cr2; 3157 env->cr[3] = sregs.cr3; 3158 env->cr[4] = sregs.cr4; 3159 3160 env->efer = sregs.efer; 3161 3162 /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */ 3163 x86_update_hflags(env); 3164 3165 return 0; 3166 } 3167 3168 static int kvm_get_msrs(X86CPU *cpu) 3169 { 3170 CPUX86State *env = &cpu->env; 3171 struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries; 3172 int ret, i; 3173 uint64_t mtrr_top_bits; 3174 3175 kvm_msr_buf_reset(cpu); 3176 3177 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0); 3178 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0); 3179 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0); 3180 kvm_msr_entry_add(cpu, MSR_PAT, 0); 3181 if (has_msr_star) { 3182 kvm_msr_entry_add(cpu, MSR_STAR, 0); 3183 } 3184 if (has_msr_hsave_pa) { 3185 kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0); 3186 } 3187 if (has_msr_tsc_aux) { 3188 kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0); 3189 } 3190 if (has_msr_tsc_adjust) { 3191 kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0); 3192 } 3193 if (has_msr_tsc_deadline) { 3194 kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0); 3195 } 3196 if (has_msr_misc_enable) { 3197 kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0); 3198 } 3199 if (has_msr_smbase) { 3200 kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0); 3201 } 3202 if (has_msr_smi_count) { 3203 kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0); 3204 } 3205 if (has_msr_feature_control) { 3206 kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0); 3207 } 3208 if (has_msr_bndcfgs) { 3209 kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0); 3210 } 3211 if (has_msr_xss) { 3212 kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0); 3213 } 3214 if (has_msr_umwait) { 3215 kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, 0); 3216 } 3217 if (has_msr_spec_ctrl) { 3218 kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0); 3219 } 3220 if (has_msr_tsx_ctrl) { 3221 kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, 0); 3222 } 3223 if (has_msr_virt_ssbd) { 3224 kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0); 3225 } 3226 if (!env->tsc_valid) { 3227 kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0); 3228 env->tsc_valid = !runstate_is_running(); 3229 } 3230 3231 #ifdef TARGET_X86_64 3232 if (lm_capable_kernel) { 3233 kvm_msr_entry_add(cpu, MSR_CSTAR, 0); 3234 kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0); 3235 kvm_msr_entry_add(cpu, MSR_FMASK, 0); 3236 kvm_msr_entry_add(cpu, MSR_LSTAR, 0); 3237 } 3238 #endif 3239 kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0); 3240 kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0); 3241 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) { 3242 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0); 3243 } 3244 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) { 3245 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0); 3246 } 3247 if 
(env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) { 3248 kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0); 3249 } 3250 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) { 3251 kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0); 3252 } 3253 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) { 3254 kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1); 3255 } 3256 if (has_architectural_pmu_version > 0) { 3257 if (has_architectural_pmu_version > 1) { 3258 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0); 3259 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0); 3260 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0); 3261 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0); 3262 } 3263 for (i = 0; i < num_architectural_pmu_fixed_counters; i++) { 3264 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0); 3265 } 3266 for (i = 0; i < num_architectural_pmu_gp_counters; i++) { 3267 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0); 3268 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0); 3269 } 3270 } 3271 3272 if (env->mcg_cap) { 3273 kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0); 3274 kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0); 3275 if (has_msr_mcg_ext_ctl) { 3276 kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0); 3277 } 3278 for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) { 3279 kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0); 3280 } 3281 } 3282 3283 if (has_msr_hv_hypercall) { 3284 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0); 3285 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0); 3286 } 3287 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) { 3288 kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0); 3289 } 3290 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) { 3291 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0); 3292 } 3293 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) { 3294 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0); 3295 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0); 3296 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0); 3297 } 3298 if (has_msr_hv_crash) { 3299 int j; 3300 3301 for (j = 0; j < HV_CRASH_PARAMS; j++) { 3302 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0); 3303 } 3304 } 3305 if (has_msr_hv_runtime) { 3306 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0); 3307 } 3308 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) { 3309 uint32_t msr; 3310 3311 kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0); 3312 kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0); 3313 kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0); 3314 for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) { 3315 kvm_msr_entry_add(cpu, msr, 0); 3316 } 3317 } 3318 if (has_msr_hv_stimer) { 3319 uint32_t msr; 3320 3321 for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT; 3322 msr++) { 3323 kvm_msr_entry_add(cpu, msr, 0); 3324 } 3325 } 3326 if (env->features[FEAT_1_EDX] & CPUID_MTRR) { 3327 kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0); 3328 kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0); 3329 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0); 3330 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0); 3331 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0); 3332 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0); 3333 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, 0); 3334 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0); 3335 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0); 3336 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0); 3337 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0); 3338 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0); 3339 for (i = 0; i < 
MSR_MTRRcap_VCNT; i++) { 3340 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0); 3341 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0); 3342 } 3343 } 3344 3345 if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) { 3346 int addr_num = 3347 kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7; 3348 3349 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0); 3350 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0); 3351 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0); 3352 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0); 3353 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0); 3354 for (i = 0; i < addr_num; i++) { 3355 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0); 3356 } 3357 } 3358 3359 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf); 3360 if (ret < 0) { 3361 return ret; 3362 } 3363 3364 if (ret < cpu->kvm_msr_buf->nmsrs) { 3365 struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret]; 3366 error_report("error: failed to get MSR 0x%" PRIx32, 3367 (uint32_t)e->index); 3368 } 3369 3370 assert(ret == cpu->kvm_msr_buf->nmsrs); 3371 /* 3372 * MTRR masks: Each mask consists of 5 parts 3373 * a 10..0: must be zero 3374 * b 11 : valid bit 3375 * c n-1.12: actual mask bits 3376 * d 51..n: reserved must be zero 3377 * e 63.52: reserved must be zero 3378 * 3379 * 'n' is the number of physical bits supported by the CPU and is 3380 * apparently always <= 52. We know our 'n' but don't know what 3381 * the destinations 'n' is; it might be smaller, in which case 3382 * it masks (c) on loading. It might be larger, in which case 3383 * we fill 'd' so that d..c is consistent irrespetive of the 'n' 3384 * we're migrating to. 3385 */ 3386 3387 if (cpu->fill_mtrr_mask) { 3388 QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52); 3389 assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS); 3390 mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits); 3391 } else { 3392 mtrr_top_bits = 0; 3393 } 3394 3395 for (i = 0; i < ret; i++) { 3396 uint32_t index = msrs[i].index; 3397 switch (index) { 3398 case MSR_IA32_SYSENTER_CS: 3399 env->sysenter_cs = msrs[i].data; 3400 break; 3401 case MSR_IA32_SYSENTER_ESP: 3402 env->sysenter_esp = msrs[i].data; 3403 break; 3404 case MSR_IA32_SYSENTER_EIP: 3405 env->sysenter_eip = msrs[i].data; 3406 break; 3407 case MSR_PAT: 3408 env->pat = msrs[i].data; 3409 break; 3410 case MSR_STAR: 3411 env->star = msrs[i].data; 3412 break; 3413 #ifdef TARGET_X86_64 3414 case MSR_CSTAR: 3415 env->cstar = msrs[i].data; 3416 break; 3417 case MSR_KERNELGSBASE: 3418 env->kernelgsbase = msrs[i].data; 3419 break; 3420 case MSR_FMASK: 3421 env->fmask = msrs[i].data; 3422 break; 3423 case MSR_LSTAR: 3424 env->lstar = msrs[i].data; 3425 break; 3426 #endif 3427 case MSR_IA32_TSC: 3428 env->tsc = msrs[i].data; 3429 break; 3430 case MSR_TSC_AUX: 3431 env->tsc_aux = msrs[i].data; 3432 break; 3433 case MSR_TSC_ADJUST: 3434 env->tsc_adjust = msrs[i].data; 3435 break; 3436 case MSR_IA32_TSCDEADLINE: 3437 env->tsc_deadline = msrs[i].data; 3438 break; 3439 case MSR_VM_HSAVE_PA: 3440 env->vm_hsave = msrs[i].data; 3441 break; 3442 case MSR_KVM_SYSTEM_TIME: 3443 env->system_time_msr = msrs[i].data; 3444 break; 3445 case MSR_KVM_WALL_CLOCK: 3446 env->wall_clock_msr = msrs[i].data; 3447 break; 3448 case MSR_MCG_STATUS: 3449 env->mcg_status = msrs[i].data; 3450 break; 3451 case MSR_MCG_CTL: 3452 env->mcg_ctl = msrs[i].data; 3453 break; 3454 case MSR_MCG_EXT_CTL: 3455 env->mcg_ext_ctl = msrs[i].data; 3456 break; 3457 case MSR_IA32_MISC_ENABLE: 3458 env->msr_ia32_misc_enable = 
msrs[i].data; 3459 break; 3460 case MSR_IA32_SMBASE: 3461 env->smbase = msrs[i].data; 3462 break; 3463 case MSR_SMI_COUNT: 3464 env->msr_smi_count = msrs[i].data; 3465 break; 3466 case MSR_IA32_FEATURE_CONTROL: 3467 env->msr_ia32_feature_control = msrs[i].data; 3468 break; 3469 case MSR_IA32_BNDCFGS: 3470 env->msr_bndcfgs = msrs[i].data; 3471 break; 3472 case MSR_IA32_XSS: 3473 env->xss = msrs[i].data; 3474 break; 3475 case MSR_IA32_UMWAIT_CONTROL: 3476 env->umwait = msrs[i].data; 3477 break; 3478 default: 3479 if (msrs[i].index >= MSR_MC0_CTL && 3480 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) { 3481 env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data; 3482 } 3483 break; 3484 case MSR_KVM_ASYNC_PF_EN: 3485 env->async_pf_en_msr = msrs[i].data; 3486 break; 3487 case MSR_KVM_ASYNC_PF_INT: 3488 env->async_pf_int_msr = msrs[i].data; 3489 break; 3490 case MSR_KVM_PV_EOI_EN: 3491 env->pv_eoi_en_msr = msrs[i].data; 3492 break; 3493 case MSR_KVM_STEAL_TIME: 3494 env->steal_time_msr = msrs[i].data; 3495 break; 3496 case MSR_KVM_POLL_CONTROL: { 3497 env->poll_control_msr = msrs[i].data; 3498 break; 3499 } 3500 case MSR_CORE_PERF_FIXED_CTR_CTRL: 3501 env->msr_fixed_ctr_ctrl = msrs[i].data; 3502 break; 3503 case MSR_CORE_PERF_GLOBAL_CTRL: 3504 env->msr_global_ctrl = msrs[i].data; 3505 break; 3506 case MSR_CORE_PERF_GLOBAL_STATUS: 3507 env->msr_global_status = msrs[i].data; 3508 break; 3509 case MSR_CORE_PERF_GLOBAL_OVF_CTRL: 3510 env->msr_global_ovf_ctrl = msrs[i].data; 3511 break; 3512 case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1: 3513 env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data; 3514 break; 3515 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1: 3516 env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data; 3517 break; 3518 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1: 3519 env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data; 3520 break; 3521 case HV_X64_MSR_HYPERCALL: 3522 env->msr_hv_hypercall = msrs[i].data; 3523 break; 3524 case HV_X64_MSR_GUEST_OS_ID: 3525 env->msr_hv_guest_os_id = msrs[i].data; 3526 break; 3527 case HV_X64_MSR_APIC_ASSIST_PAGE: 3528 env->msr_hv_vapic = msrs[i].data; 3529 break; 3530 case HV_X64_MSR_REFERENCE_TSC: 3531 env->msr_hv_tsc = msrs[i].data; 3532 break; 3533 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: 3534 env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data; 3535 break; 3536 case HV_X64_MSR_VP_RUNTIME: 3537 env->msr_hv_runtime = msrs[i].data; 3538 break; 3539 case HV_X64_MSR_SCONTROL: 3540 env->msr_hv_synic_control = msrs[i].data; 3541 break; 3542 case HV_X64_MSR_SIEFP: 3543 env->msr_hv_synic_evt_page = msrs[i].data; 3544 break; 3545 case HV_X64_MSR_SIMP: 3546 env->msr_hv_synic_msg_page = msrs[i].data; 3547 break; 3548 case HV_X64_MSR_SINT0 ... 
HV_X64_MSR_SINT15: 3549 env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data; 3550 break; 3551 case HV_X64_MSR_STIMER0_CONFIG: 3552 case HV_X64_MSR_STIMER1_CONFIG: 3553 case HV_X64_MSR_STIMER2_CONFIG: 3554 case HV_X64_MSR_STIMER3_CONFIG: 3555 env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] = 3556 msrs[i].data; 3557 break; 3558 case HV_X64_MSR_STIMER0_COUNT: 3559 case HV_X64_MSR_STIMER1_COUNT: 3560 case HV_X64_MSR_STIMER2_COUNT: 3561 case HV_X64_MSR_STIMER3_COUNT: 3562 env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] = 3563 msrs[i].data; 3564 break; 3565 case HV_X64_MSR_REENLIGHTENMENT_CONTROL: 3566 env->msr_hv_reenlightenment_control = msrs[i].data; 3567 break; 3568 case HV_X64_MSR_TSC_EMULATION_CONTROL: 3569 env->msr_hv_tsc_emulation_control = msrs[i].data; 3570 break; 3571 case HV_X64_MSR_TSC_EMULATION_STATUS: 3572 env->msr_hv_tsc_emulation_status = msrs[i].data; 3573 break; 3574 case MSR_MTRRdefType: 3575 env->mtrr_deftype = msrs[i].data; 3576 break; 3577 case MSR_MTRRfix64K_00000: 3578 env->mtrr_fixed[0] = msrs[i].data; 3579 break; 3580 case MSR_MTRRfix16K_80000: 3581 env->mtrr_fixed[1] = msrs[i].data; 3582 break; 3583 case MSR_MTRRfix16K_A0000: 3584 env->mtrr_fixed[2] = msrs[i].data; 3585 break; 3586 case MSR_MTRRfix4K_C0000: 3587 env->mtrr_fixed[3] = msrs[i].data; 3588 break; 3589 case MSR_MTRRfix4K_C8000: 3590 env->mtrr_fixed[4] = msrs[i].data; 3591 break; 3592 case MSR_MTRRfix4K_D0000: 3593 env->mtrr_fixed[5] = msrs[i].data; 3594 break; 3595 case MSR_MTRRfix4K_D8000: 3596 env->mtrr_fixed[6] = msrs[i].data; 3597 break; 3598 case MSR_MTRRfix4K_E0000: 3599 env->mtrr_fixed[7] = msrs[i].data; 3600 break; 3601 case MSR_MTRRfix4K_E8000: 3602 env->mtrr_fixed[8] = msrs[i].data; 3603 break; 3604 case MSR_MTRRfix4K_F0000: 3605 env->mtrr_fixed[9] = msrs[i].data; 3606 break; 3607 case MSR_MTRRfix4K_F8000: 3608 env->mtrr_fixed[10] = msrs[i].data; 3609 break; 3610 case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1): 3611 if (index & 1) { 3612 env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data | 3613 mtrr_top_bits; 3614 } else { 3615 env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data; 3616 } 3617 break; 3618 case MSR_IA32_SPEC_CTRL: 3619 env->spec_ctrl = msrs[i].data; 3620 break; 3621 case MSR_IA32_TSX_CTRL: 3622 env->tsx_ctrl = msrs[i].data; 3623 break; 3624 case MSR_VIRT_SSBD: 3625 env->virt_ssbd = msrs[i].data; 3626 break; 3627 case MSR_IA32_RTIT_CTL: 3628 env->msr_rtit_ctrl = msrs[i].data; 3629 break; 3630 case MSR_IA32_RTIT_STATUS: 3631 env->msr_rtit_status = msrs[i].data; 3632 break; 3633 case MSR_IA32_RTIT_OUTPUT_BASE: 3634 env->msr_rtit_output_base = msrs[i].data; 3635 break; 3636 case MSR_IA32_RTIT_OUTPUT_MASK: 3637 env->msr_rtit_output_mask = msrs[i].data; 3638 break; 3639 case MSR_IA32_RTIT_CR3_MATCH: 3640 env->msr_rtit_cr3_match = msrs[i].data; 3641 break; 3642 case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: 3643 env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data; 3644 break; 3645 } 3646 } 3647 3648 return 0; 3649 } 3650 3651 static int kvm_put_mp_state(X86CPU *cpu) 3652 { 3653 struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state }; 3654 3655 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state); 3656 } 3657 3658 static int kvm_get_mp_state(X86CPU *cpu) 3659 { 3660 CPUState *cs = CPU(cpu); 3661 CPUX86State *env = &cpu->env; 3662 struct kvm_mp_state mp_state; 3663 int ret; 3664 3665 ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state); 3666 if (ret < 0) { 3667 return ret; 3668 } 3669 env->mp_state = mp_state.mp_state; 3670 if (kvm_irqchip_in_kernel()) { 3671 cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED); 3672 } 3673 return 0; 3674 } 3675 3676 static int kvm_get_apic(X86CPU *cpu) 3677 { 3678 DeviceState *apic = cpu->apic_state; 3679 struct kvm_lapic_state kapic; 3680 int ret; 3681 3682 if (apic && kvm_irqchip_in_kernel()) { 3683 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic); 3684 if (ret < 0) { 3685 return ret; 3686 } 3687 3688 kvm_get_apic_state(apic, &kapic); 3689 } 3690 return 0; 3691 } 3692 3693 static int kvm_put_vcpu_events(X86CPU *cpu, int level) 3694 { 3695 CPUState *cs = CPU(cpu); 3696 CPUX86State *env = &cpu->env; 3697 struct kvm_vcpu_events events = {}; 3698 3699 if (!kvm_has_vcpu_events()) { 3700 return 0; 3701 } 3702 3703 events.flags = 0; 3704 3705 if (has_exception_payload) { 3706 events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD; 3707 events.exception.pending = env->exception_pending; 3708 events.exception_has_payload = env->exception_has_payload; 3709 events.exception_payload = env->exception_payload; 3710 } 3711 events.exception.nr = env->exception_nr; 3712 events.exception.injected = env->exception_injected; 3713 events.exception.has_error_code = env->has_error_code; 3714 events.exception.error_code = env->error_code; 3715 3716 events.interrupt.injected = (env->interrupt_injected >= 0); 3717 events.interrupt.nr = env->interrupt_injected; 3718 events.interrupt.soft = env->soft_interrupt; 3719 3720 events.nmi.injected = env->nmi_injected; 3721 events.nmi.pending = env->nmi_pending; 3722 events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK); 3723 3724 events.sipi_vector = env->sipi_vector; 3725 3726 if (has_msr_smbase) { 3727 events.smi.smm = !!(env->hflags & HF_SMM_MASK); 3728 events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK); 3729 if (kvm_irqchip_in_kernel()) { 3730 /* As soon as these are moved to the kernel, remove them 3731 * from cs->interrupt_request. 3732 */ 3733 events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI; 3734 events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT; 3735 cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI); 3736 } else { 3737 /* Keep these in cs->interrupt_request. */ 3738 events.smi.pending = 0; 3739 events.smi.latched_init = 0; 3740 } 3741 /* Stop SMI delivery on old machine types to avoid a reboot 3742 * on an inward migration of an old VM. 
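         * Omitting KVM_VCPUEVENT_VALID_SMM from events.flags makes KVM ignore
         * the smi fields of this struct entirely, so the destination keeps
         * whatever SMM state it already has.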
3743 */ 3744 if (!cpu->kvm_no_smi_migration) { 3745 events.flags |= KVM_VCPUEVENT_VALID_SMM; 3746 } 3747 } 3748 3749 if (level >= KVM_PUT_RESET_STATE) { 3750 events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING; 3751 if (env->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { 3752 events.flags |= KVM_VCPUEVENT_VALID_SIPI_VECTOR; 3753 } 3754 } 3755 3756 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events); 3757 } 3758 3759 static int kvm_get_vcpu_events(X86CPU *cpu) 3760 { 3761 CPUX86State *env = &cpu->env; 3762 struct kvm_vcpu_events events; 3763 int ret; 3764 3765 if (!kvm_has_vcpu_events()) { 3766 return 0; 3767 } 3768 3769 memset(&events, 0, sizeof(events)); 3770 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events); 3771 if (ret < 0) { 3772 return ret; 3773 } 3774 3775 if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) { 3776 env->exception_pending = events.exception.pending; 3777 env->exception_has_payload = events.exception_has_payload; 3778 env->exception_payload = events.exception_payload; 3779 } else { 3780 env->exception_pending = 0; 3781 env->exception_has_payload = false; 3782 } 3783 env->exception_injected = events.exception.injected; 3784 env->exception_nr = 3785 (env->exception_pending || env->exception_injected) ? 3786 events.exception.nr : -1; 3787 env->has_error_code = events.exception.has_error_code; 3788 env->error_code = events.exception.error_code; 3789 3790 env->interrupt_injected = 3791 events.interrupt.injected ? events.interrupt.nr : -1; 3792 env->soft_interrupt = events.interrupt.soft; 3793 3794 env->nmi_injected = events.nmi.injected; 3795 env->nmi_pending = events.nmi.pending; 3796 if (events.nmi.masked) { 3797 env->hflags2 |= HF2_NMI_MASK; 3798 } else { 3799 env->hflags2 &= ~HF2_NMI_MASK; 3800 } 3801 3802 if (events.flags & KVM_VCPUEVENT_VALID_SMM) { 3803 if (events.smi.smm) { 3804 env->hflags |= HF_SMM_MASK; 3805 } else { 3806 env->hflags &= ~HF_SMM_MASK; 3807 } 3808 if (events.smi.pending) { 3809 cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI); 3810 } else { 3811 cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI); 3812 } 3813 if (events.smi.smm_inside_nmi) { 3814 env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK; 3815 } else { 3816 env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK; 3817 } 3818 if (events.smi.latched_init) { 3819 cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT); 3820 } else { 3821 cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT); 3822 } 3823 } 3824 3825 env->sipi_vector = events.sipi_vector; 3826 3827 return 0; 3828 } 3829 3830 static int kvm_guest_debug_workarounds(X86CPU *cpu) 3831 { 3832 CPUState *cs = CPU(cpu); 3833 CPUX86State *env = &cpu->env; 3834 int ret = 0; 3835 unsigned long reinject_trap = 0; 3836 3837 if (!kvm_has_vcpu_events()) { 3838 if (env->exception_nr == EXCP01_DB) { 3839 reinject_trap = KVM_GUESTDBG_INJECT_DB; 3840 } else if (env->exception_injected == EXCP03_INT3) { 3841 reinject_trap = KVM_GUESTDBG_INJECT_BP; 3842 } 3843 kvm_reset_exception(env); 3844 } 3845 3846 /* 3847 * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF 3848 * injected via SET_GUEST_DEBUG while updating GP regs. Work around this 3849 * by updating the debug state once again if single-stepping is on. 3850 * Another reason to call kvm_update_guest_debug here is a pending debug 3851 * trap raise by the guest. On kernels without SET_VCPU_EVENTS we have to 3852 * reinject them via SET_GUEST_DEBUG. 
 */
    if (reinject_trap ||
        (!kvm_has_robust_singlestep() && cs->singlestep_enabled)) {
        ret = kvm_update_guest_debug(cs, reinject_trap);
    }
    return ret;
}

static int kvm_put_debugregs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_debugregs dbgregs;
    int i;

    if (!kvm_has_debugregs()) {
        return 0;
    }

    memset(&dbgregs, 0, sizeof(dbgregs));
    for (i = 0; i < 4; i++) {
        dbgregs.db[i] = env->dr[i];
    }
    dbgregs.dr6 = env->dr[6];
    dbgregs.dr7 = env->dr[7];
    dbgregs.flags = 0;

    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
}

static int kvm_get_debugregs(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    struct kvm_debugregs dbgregs;
    int i, ret;

    if (!kvm_has_debugregs()) {
        return 0;
    }

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
    if (ret < 0) {
        return ret;
    }
    for (i = 0; i < 4; i++) {
        env->dr[i] = dbgregs.db[i];
    }
    env->dr[4] = env->dr[6] = dbgregs.dr6;
    env->dr[5] = env->dr[7] = dbgregs.dr7;

    return 0;
}

static int kvm_put_nested_state(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    int max_nested_state_len = kvm_max_nested_state_length();

    if (!env->nested_state) {
        return 0;
    }

    /*
     * Copy flags that are affected by reset from env->hflags and env->hflags2.
     */
    if (env->hflags & HF_GUEST_MASK) {
        env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE;
    } else {
        env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE;
    }

    /* Don't set KVM_STATE_NESTED_GIF_SET on VMX as it is illegal */
    if (cpu_has_svm(env) && (env->hflags2 & HF2_GIF_MASK)) {
        env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET;
    } else {
        env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET;
    }

    assert(env->nested_state->size <= max_nested_state_len);
    return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
}

static int kvm_get_nested_state(X86CPU *cpu)
{
    CPUX86State *env = &cpu->env;
    int max_nested_state_len = kvm_max_nested_state_length();
    int ret;

    if (!env->nested_state) {
        return 0;
    }

    /*
     * It is possible that migration restored a smaller size into
     * nested_state->size than what our kernel supports.  We preserve the
     * migrated size for the KVM_SET_NESTED_STATE call, but make sure the
     * next KVM_GET_NESTED_STATE call uses the maximum size our kernel
     * supports.
     */
    env->nested_state->size = max_nested_state_len;

    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
    if (ret < 0) {
        return ret;
    }

    /*
     * Copy flags that are affected by reset to env->hflags and env->hflags2.
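     * (HF_GUEST_MASK always; HF2_GIF_MASK only when SVM is exposed, since GIF
     * has no meaning on VMX).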
3960 */ 3961 if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) { 3962 env->hflags |= HF_GUEST_MASK; 3963 } else { 3964 env->hflags &= ~HF_GUEST_MASK; 3965 } 3966 3967 /* Keep HF2_GIF_MASK set on !SVM as x86_cpu_pending_interrupt() needs it */ 3968 if (cpu_has_svm(env)) { 3969 if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) { 3970 env->hflags2 |= HF2_GIF_MASK; 3971 } else { 3972 env->hflags2 &= ~HF2_GIF_MASK; 3973 } 3974 } 3975 3976 return ret; 3977 } 3978 3979 int kvm_arch_put_registers(CPUState *cpu, int level) 3980 { 3981 X86CPU *x86_cpu = X86_CPU(cpu); 3982 int ret; 3983 3984 assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); 3985 3986 /* must be before kvm_put_nested_state so that EFER.SVME is set */ 3987 ret = kvm_put_sregs(x86_cpu); 3988 if (ret < 0) { 3989 return ret; 3990 } 3991 3992 if (level >= KVM_PUT_RESET_STATE) { 3993 ret = kvm_put_nested_state(x86_cpu); 3994 if (ret < 0) { 3995 return ret; 3996 } 3997 3998 ret = kvm_put_msr_feature_control(x86_cpu); 3999 if (ret < 0) { 4000 return ret; 4001 } 4002 } 4003 4004 if (level == KVM_PUT_FULL_STATE) { 4005 /* We don't check for kvm_arch_set_tsc_khz() errors here, 4006 * because TSC frequency mismatch shouldn't abort migration, 4007 * unless the user explicitly asked for a more strict TSC 4008 * setting (e.g. using an explicit "tsc-freq" option). 4009 */ 4010 kvm_arch_set_tsc_khz(cpu); 4011 } 4012 4013 ret = kvm_getput_regs(x86_cpu, 1); 4014 if (ret < 0) { 4015 return ret; 4016 } 4017 ret = kvm_put_xsave(x86_cpu); 4018 if (ret < 0) { 4019 return ret; 4020 } 4021 ret = kvm_put_xcrs(x86_cpu); 4022 if (ret < 0) { 4023 return ret; 4024 } 4025 /* must be before kvm_put_msrs */ 4026 ret = kvm_inject_mce_oldstyle(x86_cpu); 4027 if (ret < 0) { 4028 return ret; 4029 } 4030 ret = kvm_put_msrs(x86_cpu, level); 4031 if (ret < 0) { 4032 return ret; 4033 } 4034 ret = kvm_put_vcpu_events(x86_cpu, level); 4035 if (ret < 0) { 4036 return ret; 4037 } 4038 if (level >= KVM_PUT_RESET_STATE) { 4039 ret = kvm_put_mp_state(x86_cpu); 4040 if (ret < 0) { 4041 return ret; 4042 } 4043 } 4044 4045 ret = kvm_put_tscdeadline_msr(x86_cpu); 4046 if (ret < 0) { 4047 return ret; 4048 } 4049 ret = kvm_put_debugregs(x86_cpu); 4050 if (ret < 0) { 4051 return ret; 4052 } 4053 /* must be last */ 4054 ret = kvm_guest_debug_workarounds(x86_cpu); 4055 if (ret < 0) { 4056 return ret; 4057 } 4058 return 0; 4059 } 4060 4061 int kvm_arch_get_registers(CPUState *cs) 4062 { 4063 X86CPU *cpu = X86_CPU(cs); 4064 int ret; 4065 4066 assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs)); 4067 4068 ret = kvm_get_vcpu_events(cpu); 4069 if (ret < 0) { 4070 goto out; 4071 } 4072 /* 4073 * KVM_GET_MPSTATE can modify CS and RIP, call it before 4074 * KVM_GET_REGS and KVM_GET_SREGS. 
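 * (Retrieving the MP state can have the in-kernel APIC complete a pending
 * INIT/SIPI, which is presumably how CS and RIP get modified.)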
4075 */ 4076 ret = kvm_get_mp_state(cpu); 4077 if (ret < 0) { 4078 goto out; 4079 } 4080 ret = kvm_getput_regs(cpu, 0); 4081 if (ret < 0) { 4082 goto out; 4083 } 4084 ret = kvm_get_xsave(cpu); 4085 if (ret < 0) { 4086 goto out; 4087 } 4088 ret = kvm_get_xcrs(cpu); 4089 if (ret < 0) { 4090 goto out; 4091 } 4092 ret = kvm_get_sregs(cpu); 4093 if (ret < 0) { 4094 goto out; 4095 } 4096 ret = kvm_get_msrs(cpu); 4097 if (ret < 0) { 4098 goto out; 4099 } 4100 ret = kvm_get_apic(cpu); 4101 if (ret < 0) { 4102 goto out; 4103 } 4104 ret = kvm_get_debugregs(cpu); 4105 if (ret < 0) { 4106 goto out; 4107 } 4108 ret = kvm_get_nested_state(cpu); 4109 if (ret < 0) { 4110 goto out; 4111 } 4112 ret = 0; 4113 out: 4114 cpu_sync_bndcs_hflags(&cpu->env); 4115 return ret; 4116 } 4117 4118 void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run) 4119 { 4120 X86CPU *x86_cpu = X86_CPU(cpu); 4121 CPUX86State *env = &x86_cpu->env; 4122 int ret; 4123 4124 /* Inject NMI */ 4125 if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) { 4126 if (cpu->interrupt_request & CPU_INTERRUPT_NMI) { 4127 qemu_mutex_lock_iothread(); 4128 cpu->interrupt_request &= ~CPU_INTERRUPT_NMI; 4129 qemu_mutex_unlock_iothread(); 4130 DPRINTF("injected NMI\n"); 4131 ret = kvm_vcpu_ioctl(cpu, KVM_NMI); 4132 if (ret < 0) { 4133 fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n", 4134 strerror(-ret)); 4135 } 4136 } 4137 if (cpu->interrupt_request & CPU_INTERRUPT_SMI) { 4138 qemu_mutex_lock_iothread(); 4139 cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; 4140 qemu_mutex_unlock_iothread(); 4141 DPRINTF("injected SMI\n"); 4142 ret = kvm_vcpu_ioctl(cpu, KVM_SMI); 4143 if (ret < 0) { 4144 fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n", 4145 strerror(-ret)); 4146 } 4147 } 4148 } 4149 4150 if (!kvm_pic_in_kernel()) { 4151 qemu_mutex_lock_iothread(); 4152 } 4153 4154 /* Force the VCPU out of its inner loop to process any INIT requests 4155 * or (for userspace APIC, but it is cheap to combine the checks here) 4156 * pending TPR access reports. 4157 */ 4158 if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) { 4159 if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) && 4160 !(env->hflags & HF_SMM_MASK)) { 4161 cpu->exit_request = 1; 4162 } 4163 if (cpu->interrupt_request & CPU_INTERRUPT_TPR) { 4164 cpu->exit_request = 1; 4165 } 4166 } 4167 4168 if (!kvm_pic_in_kernel()) { 4169 /* Try to inject an interrupt if the guest can accept it */ 4170 if (run->ready_for_interrupt_injection && 4171 (cpu->interrupt_request & CPU_INTERRUPT_HARD) && 4172 (env->eflags & IF_MASK)) { 4173 int irq; 4174 4175 cpu->interrupt_request &= ~CPU_INTERRUPT_HARD; 4176 irq = cpu_get_pic_interrupt(env); 4177 if (irq >= 0) { 4178 struct kvm_interrupt intr; 4179 4180 intr.irq = irq; 4181 DPRINTF("injected interrupt %d\n", irq); 4182 ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr); 4183 if (ret < 0) { 4184 fprintf(stderr, 4185 "KVM: injection failed, interrupt lost (%s)\n", 4186 strerror(-ret)); 4187 } 4188 } 4189 } 4190 4191 /* If we have an interrupt but the guest is not ready to receive an 4192 * interrupt, request an interrupt window exit. This will 4193 * cause a return to userspace as soon as the guest is ready to 4194 * receive interrupts. 
*/ 4195 if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) { 4196 run->request_interrupt_window = 1; 4197 } else { 4198 run->request_interrupt_window = 0; 4199 } 4200 4201 DPRINTF("setting tpr\n"); 4202 run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state); 4203 4204 qemu_mutex_unlock_iothread(); 4205 } 4206 } 4207 4208 MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run) 4209 { 4210 X86CPU *x86_cpu = X86_CPU(cpu); 4211 CPUX86State *env = &x86_cpu->env; 4212 4213 if (run->flags & KVM_RUN_X86_SMM) { 4214 env->hflags |= HF_SMM_MASK; 4215 } else { 4216 env->hflags &= ~HF_SMM_MASK; 4217 } 4218 if (run->if_flag) { 4219 env->eflags |= IF_MASK; 4220 } else { 4221 env->eflags &= ~IF_MASK; 4222 } 4223 4224 /* We need to protect the apic state against concurrent accesses from 4225 * different threads in case the userspace irqchip is used. */ 4226 if (!kvm_irqchip_in_kernel()) { 4227 qemu_mutex_lock_iothread(); 4228 } 4229 cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8); 4230 cpu_set_apic_base(x86_cpu->apic_state, run->apic_base); 4231 if (!kvm_irqchip_in_kernel()) { 4232 qemu_mutex_unlock_iothread(); 4233 } 4234 return cpu_get_mem_attrs(env); 4235 } 4236 4237 int kvm_arch_process_async_events(CPUState *cs) 4238 { 4239 X86CPU *cpu = X86_CPU(cs); 4240 CPUX86State *env = &cpu->env; 4241 4242 if (cs->interrupt_request & CPU_INTERRUPT_MCE) { 4243 /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */ 4244 assert(env->mcg_cap); 4245 4246 cs->interrupt_request &= ~CPU_INTERRUPT_MCE; 4247 4248 kvm_cpu_synchronize_state(cs); 4249 4250 if (env->exception_nr == EXCP08_DBLE) { 4251 /* this means triple fault */ 4252 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 4253 cs->exit_request = 1; 4254 return 0; 4255 } 4256 kvm_queue_exception(env, EXCP12_MCHK, 0, 0); 4257 env->has_error_code = 0; 4258 4259 cs->halted = 0; 4260 if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) { 4261 env->mp_state = KVM_MP_STATE_RUNNABLE; 4262 } 4263 } 4264 4265 if ((cs->interrupt_request & CPU_INTERRUPT_INIT) && 4266 !(env->hflags & HF_SMM_MASK)) { 4267 kvm_cpu_synchronize_state(cs); 4268 do_cpu_init(cpu); 4269 } 4270 4271 if (kvm_irqchip_in_kernel()) { 4272 return 0; 4273 } 4274 4275 if (cs->interrupt_request & CPU_INTERRUPT_POLL) { 4276 cs->interrupt_request &= ~CPU_INTERRUPT_POLL; 4277 apic_poll_irq(cpu->apic_state); 4278 } 4279 if (((cs->interrupt_request & CPU_INTERRUPT_HARD) && 4280 (env->eflags & IF_MASK)) || 4281 (cs->interrupt_request & CPU_INTERRUPT_NMI)) { 4282 cs->halted = 0; 4283 } 4284 if (cs->interrupt_request & CPU_INTERRUPT_SIPI) { 4285 kvm_cpu_synchronize_state(cs); 4286 do_cpu_sipi(cpu); 4287 } 4288 if (cs->interrupt_request & CPU_INTERRUPT_TPR) { 4289 cs->interrupt_request &= ~CPU_INTERRUPT_TPR; 4290 kvm_cpu_synchronize_state(cs); 4291 apic_handle_tpr_access_report(cpu->apic_state, env->eip, 4292 env->tpr_access_type); 4293 } 4294 4295 return cs->halted; 4296 } 4297 4298 static int kvm_handle_halt(X86CPU *cpu) 4299 { 4300 CPUState *cs = CPU(cpu); 4301 CPUX86State *env = &cpu->env; 4302 4303 if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) && 4304 (env->eflags & IF_MASK)) && 4305 !(cs->interrupt_request & CPU_INTERRUPT_NMI)) { 4306 cs->halted = 1; 4307 return EXCP_HLT; 4308 } 4309 4310 return 0; 4311 } 4312 4313 static int kvm_handle_tpr_access(X86CPU *cpu) 4314 { 4315 CPUState *cs = CPU(cpu); 4316 struct kvm_run *run = cs->kvm_run; 4317 4318 apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip, 4319 run->tpr_access.is_write ? 
TPR_ACCESS_WRITE 4320 : TPR_ACCESS_READ); 4321 return 1; 4322 } 4323 4324 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp) 4325 { 4326 static const uint8_t int3 = 0xcc; 4327 4328 if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) || 4329 cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) { 4330 return -EINVAL; 4331 } 4332 return 0; 4333 } 4334 4335 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp) 4336 { 4337 uint8_t int3; 4338 4339 if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0) || int3 != 0xcc || 4340 cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) { 4341 return -EINVAL; 4342 } 4343 return 0; 4344 } 4345 4346 static struct { 4347 target_ulong addr; 4348 int len; 4349 int type; 4350 } hw_breakpoint[4]; 4351 4352 static int nb_hw_breakpoint; 4353 4354 static int find_hw_breakpoint(target_ulong addr, int len, int type) 4355 { 4356 int n; 4357 4358 for (n = 0; n < nb_hw_breakpoint; n++) { 4359 if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type && 4360 (hw_breakpoint[n].len == len || len == -1)) { 4361 return n; 4362 } 4363 } 4364 return -1; 4365 } 4366 4367 int kvm_arch_insert_hw_breakpoint(target_ulong addr, 4368 target_ulong len, int type) 4369 { 4370 switch (type) { 4371 case GDB_BREAKPOINT_HW: 4372 len = 1; 4373 break; 4374 case GDB_WATCHPOINT_WRITE: 4375 case GDB_WATCHPOINT_ACCESS: 4376 switch (len) { 4377 case 1: 4378 break; 4379 case 2: 4380 case 4: 4381 case 8: 4382 if (addr & (len - 1)) { 4383 return -EINVAL; 4384 } 4385 break; 4386 default: 4387 return -EINVAL; 4388 } 4389 break; 4390 default: 4391 return -ENOSYS; 4392 } 4393 4394 if (nb_hw_breakpoint == 4) { 4395 return -ENOBUFS; 4396 } 4397 if (find_hw_breakpoint(addr, len, type) >= 0) { 4398 return -EEXIST; 4399 } 4400 hw_breakpoint[nb_hw_breakpoint].addr = addr; 4401 hw_breakpoint[nb_hw_breakpoint].len = len; 4402 hw_breakpoint[nb_hw_breakpoint].type = type; 4403 nb_hw_breakpoint++; 4404 4405 return 0; 4406 } 4407 4408 int kvm_arch_remove_hw_breakpoint(target_ulong addr, 4409 target_ulong len, int type) 4410 { 4411 int n; 4412 4413 n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 
1 : len, type); 4414 if (n < 0) { 4415 return -ENOENT; 4416 } 4417 nb_hw_breakpoint--; 4418 hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint]; 4419 4420 return 0; 4421 } 4422 4423 void kvm_arch_remove_all_hw_breakpoints(void) 4424 { 4425 nb_hw_breakpoint = 0; 4426 } 4427 4428 static CPUWatchpoint hw_watchpoint; 4429 4430 static int kvm_handle_debug(X86CPU *cpu, 4431 struct kvm_debug_exit_arch *arch_info) 4432 { 4433 CPUState *cs = CPU(cpu); 4434 CPUX86State *env = &cpu->env; 4435 int ret = 0; 4436 int n; 4437 4438 if (arch_info->exception == EXCP01_DB) { 4439 if (arch_info->dr6 & DR6_BS) { 4440 if (cs->singlestep_enabled) { 4441 ret = EXCP_DEBUG; 4442 } 4443 } else { 4444 for (n = 0; n < 4; n++) { 4445 if (arch_info->dr6 & (1 << n)) { 4446 switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) { 4447 case 0x0: 4448 ret = EXCP_DEBUG; 4449 break; 4450 case 0x1: 4451 ret = EXCP_DEBUG; 4452 cs->watchpoint_hit = &hw_watchpoint; 4453 hw_watchpoint.vaddr = hw_breakpoint[n].addr; 4454 hw_watchpoint.flags = BP_MEM_WRITE; 4455 break; 4456 case 0x3: 4457 ret = EXCP_DEBUG; 4458 cs->watchpoint_hit = &hw_watchpoint; 4459 hw_watchpoint.vaddr = hw_breakpoint[n].addr; 4460 hw_watchpoint.flags = BP_MEM_ACCESS; 4461 break; 4462 } 4463 } 4464 } 4465 } 4466 } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) { 4467 ret = EXCP_DEBUG; 4468 } 4469 if (ret == 0) { 4470 cpu_synchronize_state(cs); 4471 assert(env->exception_nr == -1); 4472 4473 /* pass to guest */ 4474 kvm_queue_exception(env, arch_info->exception, 4475 arch_info->exception == EXCP01_DB, 4476 arch_info->dr6); 4477 env->has_error_code = 0; 4478 } 4479 4480 return ret; 4481 } 4482 4483 void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg) 4484 { 4485 const uint8_t type_code[] = { 4486 [GDB_BREAKPOINT_HW] = 0x0, 4487 [GDB_WATCHPOINT_WRITE] = 0x1, 4488 [GDB_WATCHPOINT_ACCESS] = 0x3 4489 }; 4490 const uint8_t len_code[] = { 4491 [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2 4492 }; 4493 int n; 4494 4495 if (kvm_sw_breakpoints_active(cpu)) { 4496 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP; 4497 } 4498 if (nb_hw_breakpoint > 0) { 4499 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP; 4500 dbg->arch.debugreg[7] = 0x0600; 4501 for (n = 0; n < nb_hw_breakpoint; n++) { 4502 dbg->arch.debugreg[n] = hw_breakpoint[n].addr; 4503 dbg->arch.debugreg[7] |= (2 << (n * 2)) | 4504 (type_code[hw_breakpoint[n].type] << (16 + n*4)) | 4505 ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4)); 4506 } 4507 } 4508 } 4509 4510 static bool host_supports_vmx(void) 4511 { 4512 uint32_t ecx, unused; 4513 4514 host_cpuid(1, 0, &unused, &unused, &ecx, &unused); 4515 return ecx & CPUID_EXT_VMX; 4516 } 4517 4518 #define VMX_INVALID_GUEST_STATE 0x80000021 4519 4520 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) 4521 { 4522 X86CPU *cpu = X86_CPU(cs); 4523 uint64_t code; 4524 int ret; 4525 4526 switch (run->exit_reason) { 4527 case KVM_EXIT_HLT: 4528 DPRINTF("handle_hlt\n"); 4529 qemu_mutex_lock_iothread(); 4530 ret = kvm_handle_halt(cpu); 4531 qemu_mutex_unlock_iothread(); 4532 break; 4533 case KVM_EXIT_SET_TPR: 4534 ret = 0; 4535 break; 4536 case KVM_EXIT_TPR_ACCESS: 4537 qemu_mutex_lock_iothread(); 4538 ret = kvm_handle_tpr_access(cpu); 4539 qemu_mutex_unlock_iothread(); 4540 break; 4541 case KVM_EXIT_FAIL_ENTRY: 4542 code = run->fail_entry.hardware_entry_failure_reason; 4543 fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n", 4544 code); 4545 if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) { 4546 
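/* Basic exit reason 33, "VM-entry failure due to invalid guest state",
 * with the VM-entry failure bit (bit 31) set. */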
fprintf(stderr, 4547 "\nIf you're running a guest on an Intel machine without " 4548 "unrestricted mode\n" 4549 "support, the failure is most likely due to the guest " 4550 "entering an invalid\n" 4551 "state for Intel VT. For example, the guest may be running " 4552 "in big real mode,\n" 4553 "which is not supported on older Intel processors." 4554 "\n\n"); 4555 } 4556 ret = -1; 4557 break; 4558 case KVM_EXIT_EXCEPTION: 4559 fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n", 4560 run->ex.exception, run->ex.error_code); 4561 ret = -1; 4562 break; 4563 case KVM_EXIT_DEBUG: 4564 DPRINTF("kvm_exit_debug\n"); 4565 qemu_mutex_lock_iothread(); 4566 ret = kvm_handle_debug(cpu, &run->debug.arch); 4567 qemu_mutex_unlock_iothread(); 4568 break; 4569 case KVM_EXIT_HYPERV: 4570 ret = kvm_hv_handle_exit(cpu, &run->hyperv); 4571 break; 4572 case KVM_EXIT_IOAPIC_EOI: 4573 ioapic_eoi_broadcast(run->eoi.vector); 4574 ret = 0; 4575 break; 4576 default: 4577 fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); 4578 ret = -1; 4579 break; 4580 } 4581 4582 return ret; 4583 } 4584 4585 bool kvm_arch_stop_on_emulation_error(CPUState *cs) 4586 { 4587 X86CPU *cpu = X86_CPU(cs); 4588 CPUX86State *env = &cpu->env; 4589 4590 kvm_cpu_synchronize_state(cs); 4591 return !(env->cr[0] & CR0_PE_MASK) || 4592 ((env->segs[R_CS].selector & 3) != 3); 4593 } 4594 4595 void kvm_arch_init_irq_routing(KVMState *s) 4596 { 4597 /* We know at this point that we're using the in-kernel 4598 * irqchip, so we can use irqfds, and on x86 we know 4599 * we can use msi via irqfd and GSI routing. 4600 */ 4601 kvm_msi_via_irqfd_allowed = true; 4602 kvm_gsi_routing_allowed = true; 4603 4604 if (kvm_irqchip_is_split()) { 4605 int i; 4606 4607 /* If the ioapic is in QEMU and the lapics are in KVM, reserve 4608 MSI routes for signaling interrupts to the local apics. */ 4609 for (i = 0; i < IOAPIC_NUM_PINS; i++) { 4610 if (kvm_irqchip_add_msi_route(s, 0, NULL) < 0) { 4611 error_report("Could not enable split IRQ mode."); 4612 exit(1); 4613 } 4614 } 4615 } 4616 } 4617 4618 int kvm_arch_irqchip_create(KVMState *s) 4619 { 4620 int ret; 4621 if (kvm_kernel_irqchip_split()) { 4622 ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24); 4623 if (ret) { 4624 error_report("Could not enable split irqchip mode: %s", 4625 strerror(-ret)); 4626 exit(1); 4627 } else { 4628 DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n"); 4629 kvm_split_irqchip = true; 4630 return 1; 4631 } 4632 } else { 4633 return 0; 4634 } 4635 } 4636 4637 uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address) 4638 { 4639 CPUX86State *env; 4640 uint64_t ext_id; 4641 4642 if (!first_cpu) { 4643 return address; 4644 } 4645 env = &X86_CPU(first_cpu)->env; 4646 if (!(env->features[FEAT_KVM] & (1 << KVM_FEATURE_MSI_EXT_DEST_ID))) { 4647 return address; 4648 } 4649 4650 /* 4651 * If the remappable format bit is set, or the upper bits are 4652 * already set in address_hi, or the low extended bits aren't 4653 * there anyway, do nothing.
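 * Otherwise move the extended-destination bits (address bits 11:5) out of
 * the low dword and up by 35, i.e. into address bits 46:40, where they
 * read as ordinary upper destination-ID bits.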
4654 */ 4655 ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT); 4656 if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) || (address >> 32)) { 4657 return address; 4658 } 4659 4660 address &= ~ext_id; 4661 address |= ext_id << 35; 4662 return address; 4663 } 4664 4665 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, 4666 uint64_t address, uint32_t data, PCIDevice *dev) 4667 { 4668 X86IOMMUState *iommu = x86_iommu_get_default(); 4669 4670 if (iommu) { 4671 X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu); 4672 4673 if (class->int_remap) { 4674 int ret; 4675 MSIMessage src, dst; 4676 4677 src.address = route->u.msi.address_hi; 4678 src.address <<= VTD_MSI_ADDR_HI_SHIFT; 4679 src.address |= route->u.msi.address_lo; 4680 src.data = route->u.msi.data; 4681 4682 ret = class->int_remap(iommu, &src, &dst, dev ? \ 4683 pci_requester_id(dev) : \ 4684 X86_IOMMU_SID_INVALID); 4685 if (ret) { 4686 trace_kvm_x86_fixup_msi_error(route->gsi); 4687 return 1; 4688 } 4689 4690 /* 4691 * Handle an untranslated compatibility-format interrupt with an 4692 * extended destination ID in the low bits 11-5. */ 4693 dst.address = kvm_swizzle_msi_ext_dest_id(dst.address); 4694 4695 route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT; 4696 route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK; 4697 route->u.msi.data = dst.data; 4698 return 0; 4699 } 4700 } 4701 4702 address = kvm_swizzle_msi_ext_dest_id(address); 4703 route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT; 4704 route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK; 4705 return 0; 4706 } 4707 4708 typedef struct MSIRouteEntry MSIRouteEntry; 4709 4710 struct MSIRouteEntry { 4711 PCIDevice *dev; /* Device pointer */ 4712 int vector; /* MSI/MSIX vector index */ 4713 int virq; /* Virtual IRQ index */ 4714 QLIST_ENTRY(MSIRouteEntry) list; 4715 }; 4716 4717 /* List of used GSI routes */ 4718 static QLIST_HEAD(, MSIRouteEntry) msi_route_list = \ 4719 QLIST_HEAD_INITIALIZER(msi_route_list); 4720 4721 static void kvm_update_msi_routes_all(void *private, bool global, 4722 uint32_t index, uint32_t mask) 4723 { 4724 int cnt = 0, vector; 4725 MSIRouteEntry *entry; 4726 MSIMessage msg; 4727 PCIDevice *dev; 4728 4729 /* TODO: explicit route update */ 4730 QLIST_FOREACH(entry, &msi_route_list, list) { 4731 cnt++; 4732 vector = entry->vector; 4733 dev = entry->dev; 4734 if (msix_enabled(dev) && !msix_is_masked(dev, vector)) { 4735 msg = msix_get_message(dev, vector); 4736 } else if (msi_enabled(dev) && !msi_is_masked(dev, vector)) { 4737 msg = msi_get_message(dev, vector); 4738 } else { 4739 /* 4740 * Either MSI/MSIX is disabled for the device, or the 4741 * specific message was masked out. Skip this one. 4742 */ 4743 continue; 4744 } 4745 kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev); 4746 } 4747 kvm_irqchip_commit_routes(kvm_state); 4748 trace_kvm_x86_update_msi_routes(cnt); 4749 } 4750 4751 int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route, 4752 int vector, PCIDevice *dev) 4753 { 4754 static bool notify_list_inited = false; 4755 MSIRouteEntry *entry; 4756 4757 if (!dev) { 4758 /* These are (possibly) IOAPIC routes that are only used in split 4759 * kernel irqchip mode, while we only keep track of 4760 * PCI devices here.
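 * (They are added with a NULL PCIDevice, one per IOAPIC pin, by
 * kvm_arch_init_irq_routing() above.)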
*/ 4761 return 0; 4762 } 4763 4764 entry = g_new0(MSIRouteEntry, 1); 4765 entry->dev = dev; 4766 entry->vector = vector; 4767 entry->virq = route->gsi; 4768 QLIST_INSERT_HEAD(&msi_route_list, entry, list); 4769 4770 trace_kvm_x86_add_msi_route(route->gsi); 4771 4772 if (!notify_list_inited) { 4773 /* The first time we add a route, register ourselves with the 4774 * IOMMU's IEC notify list if needed. */ 4775 X86IOMMUState *iommu = x86_iommu_get_default(); 4776 if (iommu) { 4777 x86_iommu_iec_register_notifier(iommu, 4778 kvm_update_msi_routes_all, 4779 NULL); 4780 } 4781 notify_list_inited = true; 4782 } 4783 return 0; 4784 } 4785 4786 int kvm_arch_release_virq_post(int virq) 4787 { 4788 MSIRouteEntry *entry, *next; 4789 QLIST_FOREACH_SAFE(entry, &msi_route_list, list, next) { 4790 if (entry->virq == virq) { 4791 trace_kvm_x86_remove_msi_route(virq); 4792 QLIST_REMOVE(entry, list); 4793 g_free(entry); 4794 break; 4795 } 4796 } 4797 return 0; 4798 } 4799 4800 int kvm_arch_msi_data_to_gsi(uint32_t data) 4801 { 4802 abort(); 4803 } 4804 4805 bool kvm_has_waitpkg(void) 4806 { 4807 return has_msr_umwait; 4808 } 4809