xref: /openbmc/qemu/target/i386/kvm/kvm.c (revision 3dba0a33)
1 /*
2  * QEMU KVM support
3  *
4  * Copyright (C) 2006-2008 Qumranet Technologies
5  * Copyright IBM, Corp. 2008
6  *
7  * Authors:
8  *  Anthony Liguori   <aliguori@us.ibm.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
11  * See the COPYING file in the top-level directory.
12  *
13  */
14 
15 #include "qemu/osdep.h"
16 #include "qapi/qapi-events-run-state.h"
17 #include "qapi/error.h"
18 #include <sys/ioctl.h>
19 #include <sys/utsname.h>
20 #include <sys/syscall.h>
21 
22 #include <linux/kvm.h>
23 #include "standard-headers/asm-x86/kvm_para.h"
24 
25 #include "cpu.h"
26 #include "host-cpu.h"
27 #include "sysemu/sysemu.h"
28 #include "sysemu/hw_accel.h"
29 #include "sysemu/kvm_int.h"
30 #include "sysemu/runstate.h"
31 #include "kvm_i386.h"
32 #include "sev.h"
33 #include "hyperv.h"
34 #include "hyperv-proto.h"
35 
36 #include "exec/gdbstub.h"
37 #include "qemu/host-utils.h"
38 #include "qemu/main-loop.h"
39 #include "qemu/config-file.h"
40 #include "qemu/error-report.h"
41 #include "qemu/memalign.h"
42 #include "hw/i386/x86.h"
43 #include "hw/i386/apic.h"
44 #include "hw/i386/apic_internal.h"
45 #include "hw/i386/apic-msidef.h"
46 #include "hw/i386/intel_iommu.h"
47 #include "hw/i386/x86-iommu.h"
48 #include "hw/i386/e820_memory_layout.h"
49 
50 #include "hw/pci/pci.h"
51 #include "hw/pci/msi.h"
52 #include "hw/pci/msix.h"
53 #include "migration/blocker.h"
54 #include "exec/memattrs.h"
55 #include "trace.h"
56 
57 #include CONFIG_DEVICES
58 
59 //#define DEBUG_KVM
60 
61 #ifdef DEBUG_KVM
62 #define DPRINTF(fmt, ...) \
63     do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
64 #else
65 #define DPRINTF(fmt, ...) \
66     do { } while (0)
67 #endif
68 
69 /* From arch/x86/kvm/lapic.h */
70 #define KVM_APIC_BUS_CYCLE_NS       1
71 #define KVM_APIC_BUS_FREQUENCY      (1000000000ULL / KVM_APIC_BUS_CYCLE_NS)
72 
73 #define MSR_KVM_WALL_CLOCK  0x11
74 #define MSR_KVM_SYSTEM_TIME 0x12
75 
76 /* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
77  * 255 kvm_msr_entry structs */
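/* (8-byte header + 255 * 16-byte entries = 4088 bytes, which fits in 4096) */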
78 #define MSR_BUF_SIZE 4096
79 
80 static void kvm_init_msrs(X86CPU *cpu);
81 
82 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
83     KVM_CAP_INFO(SET_TSS_ADDR),
84     KVM_CAP_INFO(EXT_CPUID),
85     KVM_CAP_INFO(MP_STATE),
86     KVM_CAP_LAST_INFO
87 };
88 
89 static bool has_msr_star;
90 static bool has_msr_hsave_pa;
91 static bool has_msr_tsc_aux;
92 static bool has_msr_tsc_adjust;
93 static bool has_msr_tsc_deadline;
94 static bool has_msr_feature_control;
95 static bool has_msr_misc_enable;
96 static bool has_msr_smbase;
97 static bool has_msr_bndcfgs;
98 static int lm_capable_kernel;
99 static bool has_msr_hv_hypercall;
100 static bool has_msr_hv_crash;
101 static bool has_msr_hv_reset;
102 static bool has_msr_hv_vpindex;
103 static bool hv_vpindex_settable;
104 static bool has_msr_hv_runtime;
105 static bool has_msr_hv_synic;
106 static bool has_msr_hv_stimer;
107 static bool has_msr_hv_frequencies;
108 static bool has_msr_hv_reenlightenment;
109 static bool has_msr_hv_syndbg_options;
110 static bool has_msr_xss;
111 static bool has_msr_umwait;
112 static bool has_msr_spec_ctrl;
113 static bool has_tsc_scale_msr;
114 static bool has_msr_tsx_ctrl;
115 static bool has_msr_virt_ssbd;
116 static bool has_msr_smi_count;
117 static bool has_msr_arch_capabs;
118 static bool has_msr_core_capabs;
119 static bool has_msr_vmx_vmfunc;
120 static bool has_msr_ucode_rev;
121 static bool has_msr_vmx_procbased_ctls2;
122 static bool has_msr_perf_capabs;
123 static bool has_msr_pkrs;
124 
125 static uint32_t has_architectural_pmu_version;
126 static uint32_t num_architectural_pmu_gp_counters;
127 static uint32_t num_architectural_pmu_fixed_counters;
128 
129 static int has_xsave;
130 static int has_xsave2;
131 static int has_xcrs;
132 static int has_pit_state2;
133 static int has_sregs2;
134 static int has_exception_payload;
135 static int has_triple_fault_event;
136 
137 static bool has_msr_mcg_ext_ctl;
138 
139 static struct kvm_cpuid2 *cpuid_cache;
140 static struct kvm_cpuid2 *hv_cpuid_cache;
141 static struct kvm_msr_list *kvm_feature_msrs;
142 
143 #define BUS_LOCK_SLICE_TIME 1000000000ULL /* ns */
144 static RateLimit bus_lock_ratelimit_ctrl;
145 static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value);
146 
147 int kvm_has_pit_state2(void)
148 {
149     return has_pit_state2;
150 }
151 
152 bool kvm_has_smm(void)
153 {
154     return kvm_vm_check_extension(kvm_state, KVM_CAP_X86_SMM);
155 }
156 
157 bool kvm_has_adjust_clock_stable(void)
158 {
159     int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
160 
161     return (ret & KVM_CLOCK_TSC_STABLE);
162 }
163 
164 bool kvm_has_adjust_clock(void)
165 {
166     return kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
167 }
168 
169 bool kvm_has_exception_payload(void)
170 {
171     return has_exception_payload;
172 }
173 
174 static bool kvm_x2apic_api_set_flags(uint64_t flags)
175 {
176     KVMState *s = KVM_STATE(current_accel());
177 
178     return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags);
179 }
180 
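/*
 * Evaluate 'fn' only on the first call and cache the result in '_result';
 * later calls of the enclosing function return the cached value immediately.
 */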
181 #define MEMORIZE(fn, _result) \
182     ({ \
183         static bool _memorized; \
184         \
185         if (_memorized) { \
186             return _result; \
187         } \
188         _memorized = true; \
189         _result = fn; \
190     })
191 
192 static bool has_x2apic_api;
193 
194 bool kvm_has_x2apic_api(void)
195 {
196     return has_x2apic_api;
197 }
198 
199 bool kvm_enable_x2apic(void)
200 {
201     return MEMORIZE(
202              kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS |
203                                       KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK),
204              has_x2apic_api);
205 }
206 
207 bool kvm_hv_vpindex_settable(void)
208 {
209     return hv_vpindex_settable;
210 }
211 
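/*
 * Read the guest TSC via MSR_IA32_TSC into env->tsc.  While the VM is
 * stopped the cached value is considered valid and is not re-read.
 */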
212 static int kvm_get_tsc(CPUState *cs)
213 {
214     X86CPU *cpu = X86_CPU(cs);
215     CPUX86State *env = &cpu->env;
216     uint64_t value;
217     int ret;
218 
219     if (env->tsc_valid) {
220         return 0;
221     }
222 
223     env->tsc_valid = !runstate_is_running();
224 
225     ret = kvm_get_one_msr(cpu, MSR_IA32_TSC, &value);
226     if (ret < 0) {
227         return ret;
228     }
229 
230     env->tsc = value;
231     return 0;
232 }
233 
234 static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
235 {
236     kvm_get_tsc(cpu);
237 }
238 
239 void kvm_synchronize_all_tsc(void)
240 {
241     CPUState *cpu;
242 
243     if (kvm_enabled()) {
244         CPU_FOREACH(cpu) {
245             run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
246         }
247     }
248 }
249 
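/*
 * Issue KVM_GET_SUPPORTED_CPUID with room for 'max' entries; return NULL if
 * the buffer was too small (-E2BIG) so the caller can retry with a larger one.
 */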
250 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
251 {
252     struct kvm_cpuid2 *cpuid;
253     int r, size;
254 
255     size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
256     cpuid = g_malloc0(size);
257     cpuid->nent = max;
258     r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
259     if (r == 0 && cpuid->nent >= max) {
260         r = -E2BIG;
261     }
262     if (r < 0) {
263         if (r == -E2BIG) {
264             g_free(cpuid);
265             return NULL;
266         } else {
267             fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
268                     strerror(-r));
269             exit(1);
270         }
271     }
272     return cpuid;
273 }
274 
275 /* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
276  * for all entries.
277  */
278 static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
279 {
280     struct kvm_cpuid2 *cpuid;
281     int max = 1;
282 
283     if (cpuid_cache != NULL) {
284         return cpuid_cache;
285     }
286     while ((cpuid = try_get_cpuid(s, max)) == NULL) {
287         max *= 2;
288     }
289     cpuid_cache = cpuid;
290     return cpuid;
291 }
292 
293 static bool host_tsx_broken(void)
294 {
295     int family, model, stepping;

296     char vendor[CPUID_VENDOR_SZ + 1];
297 
298     host_cpu_vendor_fms(vendor, &family, &model, &stepping);
299 
300     /* Check if we are running on a Haswell host known to have broken TSX */
301     return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
302            (family == 6) &&
303            ((model == 63 && stepping < 4) ||
304             model == 60 || model == 69 || model == 70);
305 }
306 
307 /* Return the value of a specific register in the cpuid entry
308  */
309 static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
310 {
311     uint32_t ret = 0;
312     switch (reg) {
313     case R_EAX:
314         ret = entry->eax;
315         break;
316     case R_EBX:
317         ret = entry->ebx;
318         break;
319     case R_ECX:
320         ret = entry->ecx;
321         break;
322     case R_EDX:
323         ret = entry->edx;
324         break;
325     }
326     return ret;
327 }
328 
329 /* Find the matching entry for function/index in the kvm_cpuid2 struct
330  */
331 static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
332                                                  uint32_t function,
333                                                  uint32_t index)
334 {
335     int i;
336     for (i = 0; i < cpuid->nent; ++i) {
337         if (cpuid->entries[i].function == function &&
338             cpuid->entries[i].index == index) {
339             return &cpuid->entries[i];
340         }
341     }
342     /* not found: */
343     return NULL;
344 }
345 
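/*
 * Return the host-supported value of a CPUID register for the given
 * function/index, with QEMU-specific fixups applied on top of what
 * KVM_GET_SUPPORTED_CPUID reports.
 */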
346 uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
347                                       uint32_t index, int reg)
348 {
349     struct kvm_cpuid2 *cpuid;
350     uint32_t ret = 0;
351     uint32_t cpuid_1_edx;
352     uint64_t bitmask;
353 
354     cpuid = get_supported_cpuid(s);
355 
356     struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
357     if (entry) {
358         ret = cpuid_entry_get_reg(entry, reg);
359     }
360 
361     /* Fixups for the data returned by KVM, below */
362 
363     if (function == 1 && reg == R_EDX) {
364         /* KVM before 2.6.30 misreports the following features */
365         ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
366     } else if (function == 1 && reg == R_ECX) {
367         /* We can set the hypervisor flag, even if KVM does not return it on
368          * GET_SUPPORTED_CPUID
369          */
370         ret |= CPUID_EXT_HYPERVISOR;
371         /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
372          * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
373          * and the irqchip is in the kernel.
374          */
375         if (kvm_irqchip_in_kernel() &&
376                 kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
377             ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
378         }
379 
380         /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
381          * without the in-kernel irqchip
382          */
383         if (!kvm_irqchip_in_kernel()) {
384             ret &= ~CPUID_EXT_X2APIC;
385         }
386 
387         if (enable_cpu_pm) {
388             int disable_exits = kvm_check_extension(s,
389                                                     KVM_CAP_X86_DISABLE_EXITS);
390 
391             if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
392                 ret |= CPUID_EXT_MONITOR;
393             }
394         }
395     } else if (function == 6 && reg == R_EAX) {
396         ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
397     } else if (function == 7 && index == 0 && reg == R_EBX) {
398         if (host_tsx_broken()) {
399             ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
400         }
401     } else if (function == 7 && index == 0 && reg == R_EDX) {
402         /*
403          * Linux v4.17-v4.20 incorrectly returns ARCH_CAPABILITIES on SVM hosts.
404          * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is
405          * returned by KVM_GET_MSR_INDEX_LIST.
406          */
407         if (!has_msr_arch_capabs) {
408             ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
409         }
410     } else if (function == 0xd && index == 0 &&
411                (reg == R_EAX || reg == R_EDX)) {
412         /*
413          * The value returned by KVM_GET_SUPPORTED_CPUID does not include
414          * features that still have to be enabled with the arch_prctl
415          * system call.  QEMU needs the full value, which is retrieved
416          * with KVM_GET_DEVICE_ATTR.
417          */
418         struct kvm_device_attr attr = {
419             .group = 0,
420             .attr = KVM_X86_XCOMP_GUEST_SUPP,
421             .addr = (unsigned long) &bitmask
422         };
423 
424         bool sys_attr = kvm_check_extension(s, KVM_CAP_SYS_ATTRIBUTES);
425         if (!sys_attr) {
426             return ret;
427         }
428 
429         int rc = kvm_ioctl(s, KVM_GET_DEVICE_ATTR, &attr);
430         if (rc < 0) {
431             if (rc != -ENXIO) {
432                 warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) "
433                             "error: %d", rc);
434             }
435             return ret;
436         }
437         ret = (reg == R_EAX) ? bitmask : bitmask >> 32;
438     } else if (function == 0x80000001 && reg == R_ECX) {
439         /*
440          * It's safe to enable TOPOEXT even if it's not returned by
441          * GET_SUPPORTED_CPUID.  Unconditionally enabling TOPOEXT here allows
442          * us to keep CPU models including TOPOEXT runnable on older kernels.
443          */
444         ret |= CPUID_EXT3_TOPOEXT;
445     } else if (function == 0x80000001 && reg == R_EDX) {
446         /* On Intel, kvm returns cpuid according to the Intel spec,
447          * so add missing bits according to the AMD spec:
448          */
449         cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
450         ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
451     } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) {
452         /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't
453          * be enabled without the in-kernel irqchip
454          */
455         if (!kvm_irqchip_in_kernel()) {
456             ret &= ~(1U << KVM_FEATURE_PV_UNHALT);
457         }
458         if (kvm_irqchip_is_split()) {
459             ret |= 1U << KVM_FEATURE_MSI_EXT_DEST_ID;
460         }
461     } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
462         ret |= 1U << KVM_HINTS_REALTIME;
463     }
464 
465     return ret;
466 }
467 
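/*
 * Return the host value of a feature MSR (queried via KVM_GET_MSRS), or 0 if
 * the MSR is not in KVM's feature MSR list.  VMX control MSRs are converted
 * into a mask of bits that may be set but are not required to be set.
 */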
468 uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index)
469 {
470     struct {
471         struct kvm_msrs info;
472         struct kvm_msr_entry entries[1];
473     } msr_data = {};
474     uint64_t value;
475     uint32_t ret, can_be_one, must_be_one;
476 
477     if (kvm_feature_msrs == NULL) { /* Host doesn't support feature MSRs */
478         return 0;
479     }
480 
481     /* Check if the requested MSR is a supported feature MSR */
482     int i;
483     for (i = 0; i < kvm_feature_msrs->nmsrs; i++)
484         if (kvm_feature_msrs->indices[i] == index) {
485             break;
486         }
487     if (i == kvm_feature_msrs->nmsrs) {
488         return 0; /* if the feature MSR is not supported, simply return 0 */
489     }
490 
491     msr_data.info.nmsrs = 1;
492     msr_data.entries[0].index = index;
493 
494     ret = kvm_ioctl(s, KVM_GET_MSRS, &msr_data);
495     if (ret != 1) {
496         error_report("KVM get MSR (index=0x%x) feature failed, %s",
497             index, strerror(-ret));
498         exit(1);
499     }
500 
501     value = msr_data.entries[0].data;
502     switch (index) {
503     case MSR_IA32_VMX_PROCBASED_CTLS2:
504         if (!has_msr_vmx_procbased_ctls2) {
505             /* KVM forgot to add these bits for some time, do this ourselves. */
506             if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) &
507                 CPUID_XSAVE_XSAVES) {
508                 value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32;
509             }
510             if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) &
511                 CPUID_EXT_RDRAND) {
512                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32;
513             }
514             if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
515                 CPUID_7_0_EBX_INVPCID) {
516                 value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32;
517             }
518             if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
519                 CPUID_7_0_EBX_RDSEED) {
520                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32;
521             }
522             if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) &
523                 CPUID_EXT2_RDTSCP) {
524                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32;
525             }
526         }
527         /* fall through */
528     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
529     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
530     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
531     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
532         /*
533          * Return true for bits that can be one, but do not have to be one.
534          * The SDM tells us which bits could have a "must be one" setting,
535          * so we can do the opposite transformation in make_vmx_msr_value.
536          */
537         must_be_one = (uint32_t)value;
538         can_be_one = (uint32_t)(value >> 32);
539         return can_be_one & ~must_be_one;
540 
541     default:
542         return value;
543     }
544 }
545 
546 static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
547                                      int *max_banks)
548 {
549     int r;
550 
551     r = kvm_check_extension(s, KVM_CAP_MCE);
552     if (r > 0) {
553         *max_banks = r;
554         return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
555     }
556     return -ENOSYS;
557 }
558 
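/*
 * Inject a machine check for the given guest physical address, marked as
 * action-required or action-optional depending on 'code'.  The MCE is
 * broadcast to all vCPUs if the guest CPU supports MCA broadcast and LMCE
 * is not enabled.
 */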
559 static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
560 {
561     CPUState *cs = CPU(cpu);
562     CPUX86State *env = &cpu->env;
563     uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
564                       MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
565     uint64_t mcg_status = MCG_STATUS_MCIP;
566     int flags = 0;
567 
568     if (code == BUS_MCEERR_AR) {
569         status |= MCI_STATUS_AR | 0x134;
570         mcg_status |= MCG_STATUS_RIPV | MCG_STATUS_EIPV;
571     } else {
572         status |= 0xc0;
573         mcg_status |= MCG_STATUS_RIPV;
574     }
575 
576     flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;
577     /* We need to read back the value of MSR_EXT_MCG_CTL that was set by the
578      * guest kernel back into env->mcg_ext_ctl.
579      */
580     cpu_synchronize_state(cs);
581     if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) {
582         mcg_status |= MCG_STATUS_LMCE;
583         flags = 0;
584     }
585 
586     cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
587                        (MCM_ADDR_PHYS << 6) | 0xc, flags);
588 }
589 
590 static void emit_hypervisor_memory_failure(MemoryFailureAction action, bool ar)
591 {
592     MemoryFailureFlags mff = {.action_required = ar, .recursive = false};
593 
594     qapi_event_send_memory_failure(MEMORY_FAILURE_RECIPIENT_HYPERVISOR, action,
595                                    &mff);
596 }
597 
598 static void hardware_memory_error(void *host_addr)
599 {
600     emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_FATAL, true);
601     error_report("QEMU got Hardware memory error at addr %p", host_addr);
602     exit(1);
603 }
604 
605 void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
606 {
607     X86CPU *cpu = X86_CPU(c);
608     CPUX86State *env = &cpu->env;
609     ram_addr_t ram_addr;
610     hwaddr paddr;
611 
612     /* If we get an action required MCE, it has been injected by KVM
613      * while the VM was running.  An action optional MCE instead should
614      * be coming from the main thread, which qemu_init_sigbus identifies
615      * as the "early kill" thread.
616      */
617     assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
618 
619     if ((env->mcg_cap & MCG_SER_P) && addr) {
620         ram_addr = qemu_ram_addr_from_host(addr);
621         if (ram_addr != RAM_ADDR_INVALID &&
622             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
623             kvm_hwpoison_page_add(ram_addr);
624             kvm_mce_inject(cpu, paddr, code);
625 
626             /*
627              * Use different logging severity based on error type.
628              * If there is additional MCE reporting on the hypervisor, QEMU VA
629              * could be another source to identify the PA and MCE details.
630              */
631             if (code == BUS_MCEERR_AR) {
632                 error_report("Guest MCE Memory Error at QEMU addr %p and "
633                     "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
634                     addr, paddr, "BUS_MCEERR_AR");
635             } else {
636                  warn_report("Guest MCE Memory Error at QEMU addr %p and "
637                      "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
638                      addr, paddr, "BUS_MCEERR_AO");
639             }
640 
641             return;
642         }
643 
644         if (code == BUS_MCEERR_AO) {
645             warn_report("Hardware memory error at addr %p of type %s "
646                 "for memory used by QEMU itself instead of guest system!",
647                  addr, "BUS_MCEERR_AO");
648         }
649     }
650 
651     if (code == BUS_MCEERR_AR) {
652         hardware_memory_error(addr);
653     }
654 
655     /* Hope we are lucky for AO MCE, just notify an event */
656     emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_IGNORE, false);
657 }
658 
659 static void kvm_reset_exception(CPUX86State *env)
660 {
661     env->exception_nr = -1;
662     env->exception_pending = 0;
663     env->exception_injected = 0;
664     env->exception_has_payload = false;
665     env->exception_payload = 0;
666 }
667 
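/*
 * Queue an exception for injection.  When the kernel supports exception
 * payloads the payload is kept pending alongside the exception; otherwise
 * it is written into dr[6]/cr[2] directly and the exception is injected.
 */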
668 static void kvm_queue_exception(CPUX86State *env,
669                                 int32_t exception_nr,
670                                 uint8_t exception_has_payload,
671                                 uint64_t exception_payload)
672 {
673     assert(env->exception_nr == -1);
674     assert(!env->exception_pending);
675     assert(!env->exception_injected);
676     assert(!env->exception_has_payload);
677 
678     env->exception_nr = exception_nr;
679 
680     if (has_exception_payload) {
681         env->exception_pending = 1;
682 
683         env->exception_has_payload = exception_has_payload;
684         env->exception_payload = exception_payload;
685     } else {
686         env->exception_injected = 1;
687 
688         if (exception_nr == EXCP01_DB) {
689             assert(exception_has_payload);
690             env->dr[6] = exception_payload;
691         } else if (exception_nr == EXCP0E_PAGE) {
692             assert(exception_has_payload);
693             env->cr[2] = exception_payload;
694         } else {
695             assert(!exception_has_payload);
696         }
697     }
698 }
699 
700 static int kvm_inject_mce_oldstyle(X86CPU *cpu)
701 {
702     CPUX86State *env = &cpu->env;
703 
704     if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
705         unsigned int bank, bank_num = env->mcg_cap & 0xff;
706         struct kvm_x86_mce mce;
707 
708         kvm_reset_exception(env);
709 
710         /*
711          * There must be at least one bank in use if an MCE is pending.
712          * Find it and use its values for the event injection.
713          */
714         for (bank = 0; bank < bank_num; bank++) {
715             if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
716                 break;
717             }
718         }
719         assert(bank < bank_num);
720 
721         mce.bank = bank;
722         mce.status = env->mce_banks[bank * 4 + 1];
723         mce.mcg_status = env->mcg_status;
724         mce.addr = env->mce_banks[bank * 4 + 2];
725         mce.misc = env->mce_banks[bank * 4 + 3];
726 
727         return kvm_vcpu_ioctl(CPU(cpu), KVM_X86_SET_MCE, &mce);
728     }
729     return 0;
730 }
731 
732 static void cpu_update_state(void *opaque, bool running, RunState state)
733 {
734     CPUX86State *env = opaque;
735 
736     if (running) {
737         env->tsc_valid = false;
738     }
739 }
740 
741 unsigned long kvm_arch_vcpu_id(CPUState *cs)
742 {
743     X86CPU *cpu = X86_CPU(cs);
744     return cpu->apic_id;
745 }
746 
747 #ifndef KVM_CPUID_SIGNATURE_NEXT
748 #define KVM_CPUID_SIGNATURE_NEXT                0x40000100
749 #endif
750 
751 static bool hyperv_enabled(X86CPU *cpu)
752 {
753     return kvm_check_extension(kvm_state, KVM_CAP_HYPERV) > 0 &&
754         ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) ||
755          cpu->hyperv_features || cpu->hyperv_passthrough);
756 }
757 
758 /*
759  * Check whether target_freq is within conservative
760  * ntp correctable bounds (250ppm) of freq
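 * (e.g. for a 2,000,000 kHz frequency the window is +/- 500 kHz)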
761  */
762 static inline bool freq_within_bounds(int freq, int target_freq)
763 {
764         int max_freq = freq + (freq * 250 / 1000000);
765         int min_freq = freq - (freq * 250 / 1000000);
766 
767         if (target_freq >= min_freq && target_freq <= max_freq) {
768                 return true;
769         }
770 
771         return false;
772 }
773 
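/*
 * Try to set the guest TSC frequency via KVM_SET_TSC_KHZ.  This is attempted
 * when TSC scaling is available or when the requested value lies within
 * NTP-correctable bounds of the current frequency; a failure is only
 * reported as an error if the frequencies actually differ.
 */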
774 static int kvm_arch_set_tsc_khz(CPUState *cs)
775 {
776     X86CPU *cpu = X86_CPU(cs);
777     CPUX86State *env = &cpu->env;
778     int r, cur_freq;
779     bool set_ioctl = false;
780 
781     if (!env->tsc_khz) {
782         return 0;
783     }
784 
785     cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
786                kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
787 
788     /*
789      * If TSC scaling is supported, attempt to set TSC frequency.
790      */
791     if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) {
792         set_ioctl = true;
793     }
794 
795     /*
796      * If desired TSC frequency is within bounds of NTP correction,
797      * attempt to set TSC frequency.
798      */
799     if (cur_freq != -ENOTSUP && freq_within_bounds(cur_freq, env->tsc_khz)) {
800         set_ioctl = true;
801     }
802 
803     r = set_ioctl ?
804         kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
805         -ENOTSUP;
806 
807     if (r < 0) {
808         /* When KVM_SET_TSC_KHZ fails, it's an error only if the current
809          * TSC frequency doesn't match the one we want.
810          */
811         cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
812                    kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
813                    -ENOTSUP;
814         if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
815             warn_report("TSC frequency mismatch between "
816                         "VM (%" PRId64 " kHz) and host (%d kHz), "
817                         "and TSC scaling unavailable",
818                         env->tsc_khz, cur_freq);
819             return r;
820         }
821     }
822 
823     return 0;
824 }
825 
826 static bool tsc_is_stable_and_known(CPUX86State *env)
827 {
828     if (!env->tsc_khz) {
829         return false;
830     }
831     return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC)
832         || env->user_tsc_khz;
833 }
834 
835 #define DEFAULT_EVMCS_VERSION ((1 << 8) | 1)
836 
837 static struct {
838     const char *desc;
839     struct {
840         uint32_t func;
841         int reg;
842         uint32_t bits;
843     } flags[2];
844     uint64_t dependencies;
845 } kvm_hyperv_properties[] = {
846     [HYPERV_FEAT_RELAXED] = {
847         .desc = "relaxed timing (hv-relaxed)",
848         .flags = {
849             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
850              .bits = HV_RELAXED_TIMING_RECOMMENDED}
851         }
852     },
853     [HYPERV_FEAT_VAPIC] = {
854         .desc = "virtual APIC (hv-vapic)",
855         .flags = {
856             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
857              .bits = HV_APIC_ACCESS_AVAILABLE}
858         }
859     },
860     [HYPERV_FEAT_TIME] = {
861         .desc = "clocksources (hv-time)",
862         .flags = {
863             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
864              .bits = HV_TIME_REF_COUNT_AVAILABLE | HV_REFERENCE_TSC_AVAILABLE}
865         }
866     },
867     [HYPERV_FEAT_CRASH] = {
868         .desc = "crash MSRs (hv-crash)",
869         .flags = {
870             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
871              .bits = HV_GUEST_CRASH_MSR_AVAILABLE}
872         }
873     },
874     [HYPERV_FEAT_RESET] = {
875         .desc = "reset MSR (hv-reset)",
876         .flags = {
877             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
878              .bits = HV_RESET_AVAILABLE}
879         }
880     },
881     [HYPERV_FEAT_VPINDEX] = {
882         .desc = "VP_INDEX MSR (hv-vpindex)",
883         .flags = {
884             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
885              .bits = HV_VP_INDEX_AVAILABLE}
886         }
887     },
888     [HYPERV_FEAT_RUNTIME] = {
889         .desc = "VP_RUNTIME MSR (hv-runtime)",
890         .flags = {
891             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
892              .bits = HV_VP_RUNTIME_AVAILABLE}
893         }
894     },
895     [HYPERV_FEAT_SYNIC] = {
896         .desc = "synthetic interrupt controller (hv-synic)",
897         .flags = {
898             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
899              .bits = HV_SYNIC_AVAILABLE}
900         }
901     },
902     [HYPERV_FEAT_STIMER] = {
903         .desc = "synthetic timers (hv-stimer)",
904         .flags = {
905             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
906              .bits = HV_SYNTIMERS_AVAILABLE}
907         },
908         .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME)
909     },
910     [HYPERV_FEAT_FREQUENCIES] = {
911         .desc = "frequency MSRs (hv-frequencies)",
912         .flags = {
913             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
914              .bits = HV_ACCESS_FREQUENCY_MSRS},
915             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
916              .bits = HV_FREQUENCY_MSRS_AVAILABLE}
917         }
918     },
919     [HYPERV_FEAT_REENLIGHTENMENT] = {
920         .desc = "reenlightenment MSRs (hv-reenlightenment)",
921         .flags = {
922             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
923              .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL}
924         }
925     },
926     [HYPERV_FEAT_TLBFLUSH] = {
927         .desc = "paravirtualized TLB flush (hv-tlbflush)",
928         .flags = {
929             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
930              .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED |
931              HV_EX_PROCESSOR_MASKS_RECOMMENDED}
932         },
933         .dependencies = BIT(HYPERV_FEAT_VPINDEX)
934     },
935     [HYPERV_FEAT_EVMCS] = {
936         .desc = "enlightened VMCS (hv-evmcs)",
937         .flags = {
938             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
939              .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED}
940         },
941         .dependencies = BIT(HYPERV_FEAT_VAPIC)
942     },
943     [HYPERV_FEAT_IPI] = {
944         .desc = "paravirtualized IPI (hv-ipi)",
945         .flags = {
946             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
947              .bits = HV_CLUSTER_IPI_RECOMMENDED |
948              HV_EX_PROCESSOR_MASKS_RECOMMENDED}
949         },
950         .dependencies = BIT(HYPERV_FEAT_VPINDEX)
951     },
952     [HYPERV_FEAT_STIMER_DIRECT] = {
953         .desc = "direct mode synthetic timers (hv-stimer-direct)",
954         .flags = {
955             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
956              .bits = HV_STIMER_DIRECT_MODE_AVAILABLE}
957         },
958         .dependencies = BIT(HYPERV_FEAT_STIMER)
959     },
960     [HYPERV_FEAT_AVIC] = {
961         .desc = "AVIC/APICv support (hv-avic/hv-apicv)",
962         .flags = {
963             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
964              .bits = HV_DEPRECATING_AEOI_RECOMMENDED}
965         }
966     },
967 #ifdef CONFIG_SYNDBG
968     [HYPERV_FEAT_SYNDBG] = {
969         .desc = "Enable synthetic kernel debugger channel (hv-syndbg)",
970         .flags = {
971             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
972              .bits = HV_FEATURE_DEBUG_MSRS_AVAILABLE}
973         },
974         .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED)
975     },
976 #endif
977     [HYPERV_FEAT_MSR_BITMAP] = {
978         .desc = "enlightened MSR-Bitmap (hv-emsr-bitmap)",
979         .flags = {
980             {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX,
981              .bits = HV_NESTED_MSR_BITMAP}
982         }
983     },
984     [HYPERV_FEAT_XMM_INPUT] = {
985         .desc = "XMM fast hypercall input (hv-xmm-input)",
986         .flags = {
987             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
988              .bits = HV_HYPERCALL_XMM_INPUT_AVAILABLE}
989         }
990     },
991     [HYPERV_FEAT_TLBFLUSH_EXT] = {
992         .desc = "Extended gva ranges for TLB flush hypercalls (hv-tlbflush-ext)",
993         .flags = {
994             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
995              .bits = HV_EXT_GVA_RANGES_FLUSH_AVAILABLE}
996         },
997         .dependencies = BIT(HYPERV_FEAT_TLBFLUSH)
998     },
999     [HYPERV_FEAT_TLBFLUSH_DIRECT] = {
1000         .desc = "direct TLB flush (hv-tlbflush-direct)",
1001         .flags = {
1002             {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX,
1003              .bits = HV_NESTED_DIRECT_FLUSH}
1004         },
1005         .dependencies = BIT(HYPERV_FEAT_VAPIC)
1006     },
1007 };
1008 
1009 static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max,
1010                                            bool do_sys_ioctl)
1011 {
1012     struct kvm_cpuid2 *cpuid;
1013     int r, size;
1014 
1015     size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
1016     cpuid = g_malloc0(size);
1017     cpuid->nent = max;
1018 
1019     if (do_sys_ioctl) {
1020         r = kvm_ioctl(kvm_state, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
1021     } else {
1022         r = kvm_vcpu_ioctl(cs, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
1023     }
1024     if (r == 0 && cpuid->nent >= max) {
1025         r = -E2BIG;
1026     }
1027     if (r < 0) {
1028         if (r == -E2BIG) {
1029             g_free(cpuid);
1030             return NULL;
1031         } else {
1032             fprintf(stderr, "KVM_GET_SUPPORTED_HV_CPUID failed: %s\n",
1033                     strerror(-r));
1034             exit(1);
1035         }
1036     }
1037     return cpuid;
1038 }
1039 
1040 /*
1041  * Run KVM_GET_SUPPORTED_HV_CPUID ioctl(), allocating a buffer large enough
1042  * for all entries.
1043  */
1044 static struct kvm_cpuid2 *get_supported_hv_cpuid(CPUState *cs)
1045 {
1046     struct kvm_cpuid2 *cpuid;
1047     /* 0x40000000..0x40000005, 0x4000000A, 0x40000080..0x40000082 leaves */
1048     int max = 11;
1049     int i;
1050     bool do_sys_ioctl;
1051 
1052     do_sys_ioctl =
1053         kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID) > 0;
1054 
1055     /*
1056      * A non-empty KVM context is needed when KVM_CAP_SYS_HYPERV_CPUID is
1057      * unsupported; kvm_hyperv_expand_features() checks for that.
1058      */
1059     assert(do_sys_ioctl || cs->kvm_state);
1060 
1061     /*
1062      * When the buffer is too small, KVM_GET_SUPPORTED_HV_CPUID fails with
1063      * -E2BIG; however, it doesn't report back the right size. Keep increasing
1064      * it and retrying until we succeed.
1065      */
1066     while ((cpuid = try_get_hv_cpuid(cs, max, do_sys_ioctl)) == NULL) {
1067         max++;
1068     }
1069 
1070     /*
1071      * KVM_GET_SUPPORTED_HV_CPUID does not set EVMCS CPUID bit before
1072      * KVM_CAP_HYPERV_ENLIGHTENED_VMCS is enabled but we want to get the
1073      * information early, just check for the capability and set the bit
1074      * manually.
1075      */
1076     if (!do_sys_ioctl && kvm_check_extension(cs->kvm_state,
1077                             KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
1078         for (i = 0; i < cpuid->nent; i++) {
1079             if (cpuid->entries[i].function == HV_CPUID_ENLIGHTMENT_INFO) {
1080                 cpuid->entries[i].eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
1081             }
1082         }
1083     }
1084 
1085     return cpuid;
1086 }
1087 
1088 /*
1089  * When KVM_GET_SUPPORTED_HV_CPUID is not supported we fill CPUID feature
1090  * leaves from KVM_CAP_HYPERV* and present MSRs data.
1091  */
1092 static struct kvm_cpuid2 *get_supported_hv_cpuid_legacy(CPUState *cs)
1093 {
1094     X86CPU *cpu = X86_CPU(cs);
1095     struct kvm_cpuid2 *cpuid;
1096     struct kvm_cpuid_entry2 *entry_feat, *entry_recomm;
1097 
1098     /* HV_CPUID_FEATURES, HV_CPUID_ENLIGHTMENT_INFO */
1099     cpuid = g_malloc0(sizeof(*cpuid) + 2 * sizeof(*cpuid->entries));
1100     cpuid->nent = 2;
1101 
1102     /* HV_CPUID_VENDOR_AND_MAX_FUNCTIONS */
1103     entry_feat = &cpuid->entries[0];
1104     entry_feat->function = HV_CPUID_FEATURES;
1105 
1106     entry_recomm = &cpuid->entries[1];
1107     entry_recomm->function = HV_CPUID_ENLIGHTMENT_INFO;
1108     entry_recomm->ebx = cpu->hyperv_spinlock_attempts;
1109 
1110     if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0) {
1111         entry_feat->eax |= HV_HYPERCALL_AVAILABLE;
1112         entry_feat->eax |= HV_APIC_ACCESS_AVAILABLE;
1113         entry_feat->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1114         entry_recomm->eax |= HV_RELAXED_TIMING_RECOMMENDED;
1115         entry_recomm->eax |= HV_APIC_ACCESS_RECOMMENDED;
1116     }
1117 
1118     if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
1119         entry_feat->eax |= HV_TIME_REF_COUNT_AVAILABLE;
1120         entry_feat->eax |= HV_REFERENCE_TSC_AVAILABLE;
1121     }
1122 
1123     if (has_msr_hv_frequencies) {
1124         entry_feat->eax |= HV_ACCESS_FREQUENCY_MSRS;
1125         entry_feat->edx |= HV_FREQUENCY_MSRS_AVAILABLE;
1126     }
1127 
1128     if (has_msr_hv_crash) {
1129         entry_feat->edx |= HV_GUEST_CRASH_MSR_AVAILABLE;
1130     }
1131 
1132     if (has_msr_hv_reenlightenment) {
1133         entry_feat->eax |= HV_ACCESS_REENLIGHTENMENTS_CONTROL;
1134     }
1135 
1136     if (has_msr_hv_reset) {
1137         entry_feat->eax |= HV_RESET_AVAILABLE;
1138     }
1139 
1140     if (has_msr_hv_vpindex) {
1141         entry_feat->eax |= HV_VP_INDEX_AVAILABLE;
1142     }
1143 
1144     if (has_msr_hv_runtime) {
1145         entry_feat->eax |= HV_VP_RUNTIME_AVAILABLE;
1146     }
1147 
1148     if (has_msr_hv_synic) {
1149         unsigned int cap = cpu->hyperv_synic_kvm_only ?
1150             KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1151 
1152         if (kvm_check_extension(cs->kvm_state, cap) > 0) {
1153             entry_feat->eax |= HV_SYNIC_AVAILABLE;
1154         }
1155     }
1156 
1157     if (has_msr_hv_stimer) {
1158         entry_feat->eax |= HV_SYNTIMERS_AVAILABLE;
1159     }
1160 
1161     if (has_msr_hv_syndbg_options) {
1162         entry_feat->edx |= HV_GUEST_DEBUGGING_AVAILABLE;
1163         entry_feat->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
1164         entry_feat->ebx |= HV_PARTITION_DEBUGGING_ALLOWED;
1165     }
1166 
1167     if (kvm_check_extension(cs->kvm_state,
1168                             KVM_CAP_HYPERV_TLBFLUSH) > 0) {
1169         entry_recomm->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED;
1170         entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1171     }
1172 
1173     if (kvm_check_extension(cs->kvm_state,
1174                             KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
1175         entry_recomm->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
1176     }
1177 
1178     if (kvm_check_extension(cs->kvm_state,
1179                             KVM_CAP_HYPERV_SEND_IPI) > 0) {
1180         entry_recomm->eax |= HV_CLUSTER_IPI_RECOMMENDED;
1181         entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1182     }
1183 
1184     return cpuid;
1185 }
1186 
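/*
 * Return the host-supported value of a Hyper-V CPUID register, using (and
 * populating) the cached KVM_GET_SUPPORTED_HV_CPUID data.
 */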
1187 static uint32_t hv_cpuid_get_host(CPUState *cs, uint32_t func, int reg)
1188 {
1189     struct kvm_cpuid_entry2 *entry;
1190     struct kvm_cpuid2 *cpuid;
1191 
1192     if (hv_cpuid_cache) {
1193         cpuid = hv_cpuid_cache;
1194     } else {
1195         if (kvm_check_extension(kvm_state, KVM_CAP_HYPERV_CPUID) > 0) {
1196             cpuid = get_supported_hv_cpuid(cs);
1197         } else {
1198             /*
1199              * 'cs->kvm_state' may be NULL when Hyper-V features are expanded
1200              * before KVM context is created but this is only done when
1201              * KVM_CAP_SYS_HYPERV_CPUID is supported and it implies
1202              * KVM_CAP_HYPERV_CPUID.
1203              */
1204             assert(cs->kvm_state);
1205 
1206             cpuid = get_supported_hv_cpuid_legacy(cs);
1207         }
1208         hv_cpuid_cache = cpuid;
1209     }
1210 
1211     if (!cpuid) {
1212         return 0;
1213     }
1214 
1215     entry = cpuid_find_entry(cpuid, func, 0);
1216     if (!entry) {
1217         return 0;
1218     }
1219 
1220     return cpuid_entry_get_reg(entry, reg);
1221 }
1222 
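/* Check that all host CPUID bits required by the Hyper-V feature are present */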
1223 static bool hyperv_feature_supported(CPUState *cs, int feature)
1224 {
1225     uint32_t func, bits;
1226     int i, reg;
1227 
1228     for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {
1229 
1230         func = kvm_hyperv_properties[feature].flags[i].func;
1231         reg = kvm_hyperv_properties[feature].flags[i].reg;
1232         bits = kvm_hyperv_properties[feature].flags[i].bits;
1233 
1234         if (!func) {
1235             continue;
1236         }
1237 
1238         if ((hv_cpuid_get_host(cs, func, reg) & bits) != bits) {
1239             return false;
1240         }
1241     }
1242 
1243     return true;
1244 }
1245 
1246 /* Checks that all feature dependencies are enabled */
1247 static bool hv_feature_check_deps(X86CPU *cpu, int feature, Error **errp)
1248 {
1249     uint64_t deps;
1250     int dep_feat;
1251 
1252     deps = kvm_hyperv_properties[feature].dependencies;
1253     while (deps) {
1254         dep_feat = ctz64(deps);
1255         if (!(hyperv_feat_enabled(cpu, dep_feat))) {
1256             error_setg(errp, "Hyper-V %s requires Hyper-V %s",
1257                        kvm_hyperv_properties[feature].desc,
1258                        kvm_hyperv_properties[dep_feat].desc);
1259             return false;
1260         }
1261         deps &= ~(1ull << dep_feat);
1262     }
1263 
1264     return true;
1265 }
1266 
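/*
 * Collect the CPUID bits for the given Hyper-V leaf/register from all
 * enabled Hyper-V features in kvm_hyperv_properties[].
 */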
1267 static uint32_t hv_build_cpuid_leaf(CPUState *cs, uint32_t func, int reg)
1268 {
1269     X86CPU *cpu = X86_CPU(cs);
1270     uint32_t r = 0;
1271     int i, j;
1272 
1273     for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties); i++) {
1274         if (!hyperv_feat_enabled(cpu, i)) {
1275             continue;
1276         }
1277 
1278         for (j = 0; j < ARRAY_SIZE(kvm_hyperv_properties[i].flags); j++) {
1279             if (kvm_hyperv_properties[i].flags[j].func != func) {
1280                 continue;
1281             }
1282             if (kvm_hyperv_properties[i].flags[j].reg != reg) {
1283                 continue;
1284             }
1285 
1286             r |= kvm_hyperv_properties[i].flags[j].bits;
1287         }
1288     }
1289 
1290     /* HV_CPUID_NESTED_FEATURES.EAX also encodes the supported eVMCS range */
1291     if (func == HV_CPUID_NESTED_FEATURES && reg == R_EAX) {
1292         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
1293             r |= DEFAULT_EVMCS_VERSION;
1294         }
1295     }
1296 
1297     return r;
1298 }
1299 
1300 /*
1301  * Expand Hyper-V CPU features. In particular, check that all the requested
1302  * features are supported by the host and that the configuration is sane
1303  * (all the required dependencies are included). Also, this takes care
1304  * of 'hv_passthrough' mode and fills the environment with all supported
1305  * Hyper-V features.
1306  */
1307 bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp)
1308 {
1309     CPUState *cs = CPU(cpu);
1310     Error *local_err = NULL;
1311     int feat;
1312 
1313     if (!hyperv_enabled(cpu))
1314         return true;
1315 
1316     /*
1317      * When kvm_hyperv_expand_features is called at CPU feature expansion
1318      * time per-CPU kvm_state is not available yet so we can only proceed
1319      * when KVM_CAP_SYS_HYPERV_CPUID is supported.
1320      */
1321     if (!cs->kvm_state &&
1322         !kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID))
1323         return true;
1324 
1325     if (cpu->hyperv_passthrough) {
1326         cpu->hyperv_vendor_id[0] =
1327             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EBX);
1328         cpu->hyperv_vendor_id[1] =
1329             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_ECX);
1330         cpu->hyperv_vendor_id[2] =
1331             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EDX);
1332         cpu->hyperv_vendor = g_realloc(cpu->hyperv_vendor,
1333                                        sizeof(cpu->hyperv_vendor_id) + 1);
1334         memcpy(cpu->hyperv_vendor, cpu->hyperv_vendor_id,
1335                sizeof(cpu->hyperv_vendor_id));
1336         cpu->hyperv_vendor[sizeof(cpu->hyperv_vendor_id)] = 0;
1337 
1338         cpu->hyperv_interface_id[0] =
1339             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EAX);
1340         cpu->hyperv_interface_id[1] =
1341             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EBX);
1342         cpu->hyperv_interface_id[2] =
1343             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_ECX);
1344         cpu->hyperv_interface_id[3] =
1345             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EDX);
1346 
1347         cpu->hyperv_ver_id_build =
1348             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EAX);
1349         cpu->hyperv_ver_id_major =
1350             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) >> 16;
1351         cpu->hyperv_ver_id_minor =
1352             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) & 0xffff;
1353         cpu->hyperv_ver_id_sp =
1354             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_ECX);
1355         cpu->hyperv_ver_id_sb =
1356             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) >> 24;
1357         cpu->hyperv_ver_id_sn =
1358             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) & 0xffffff;
1359 
1360         cpu->hv_max_vps = hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS,
1361                                             R_EAX);
1362         cpu->hyperv_limits[0] =
1363             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EBX);
1364         cpu->hyperv_limits[1] =
1365             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_ECX);
1366         cpu->hyperv_limits[2] =
1367             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EDX);
1368 
1369         cpu->hyperv_spinlock_attempts =
1370             hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EBX);
1371 
1372         /*
1373          * Mark supported features as enabled in 'cpu->hyperv_features', as
1374          * hv_build_cpuid_leaf() uses this info to build guest CPUIDs.
1375          */
1376         for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
1377             if (hyperv_feature_supported(cs, feat)) {
1378                 cpu->hyperv_features |= BIT(feat);
1379             }
1380         }
1381     } else {
1382         /* Check features availability and dependencies */
1383         for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
1384             /* If the feature was not requested skip it. */
1385             if (!hyperv_feat_enabled(cpu, feat)) {
1386                 continue;
1387             }
1388 
1389             /* Check if the feature is supported by KVM */
1390             if (!hyperv_feature_supported(cs, feat)) {
1391                 error_setg(errp, "Hyper-V %s is not supported by kernel",
1392                            kvm_hyperv_properties[feat].desc);
1393                 return false;
1394             }
1395 
1396             /* Check dependencies */
1397             if (!hv_feature_check_deps(cpu, feat, &local_err)) {
1398                 error_propagate(errp, local_err);
1399                 return false;
1400             }
1401         }
1402     }
1403 
1404     /* Additional dependencies not covered by kvm_hyperv_properties[] */
1405     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1406         !cpu->hyperv_synic_kvm_only &&
1407         !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) {
1408         error_setg(errp, "Hyper-V %s requires Hyper-V %s",
1409                    kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc,
1410                    kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc);
1411         return false;
1412     }
1413 
1414     return true;
1415 }
1416 
1417 /*
1418  * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent.
1419  */
1420 static int hyperv_fill_cpuids(CPUState *cs,
1421                               struct kvm_cpuid_entry2 *cpuid_ent)
1422 {
1423     X86CPU *cpu = X86_CPU(cs);
1424     struct kvm_cpuid_entry2 *c;
1425     uint32_t signature[3];
1426     uint32_t cpuid_i = 0, max_cpuid_leaf = 0;
1427     uint32_t nested_eax =
1428         hv_build_cpuid_leaf(cs, HV_CPUID_NESTED_FEATURES, R_EAX);
1429 
1430     max_cpuid_leaf = nested_eax ? HV_CPUID_NESTED_FEATURES :
1431         HV_CPUID_IMPLEMENT_LIMITS;
1432 
1433     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) {
1434         max_cpuid_leaf =
1435             MAX(max_cpuid_leaf, HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
1436     }
1437 
1438     c = &cpuid_ent[cpuid_i++];
1439     c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
1440     c->eax = max_cpuid_leaf;
1441     c->ebx = cpu->hyperv_vendor_id[0];
1442     c->ecx = cpu->hyperv_vendor_id[1];
1443     c->edx = cpu->hyperv_vendor_id[2];
1444 
1445     c = &cpuid_ent[cpuid_i++];
1446     c->function = HV_CPUID_INTERFACE;
1447     c->eax = cpu->hyperv_interface_id[0];
1448     c->ebx = cpu->hyperv_interface_id[1];
1449     c->ecx = cpu->hyperv_interface_id[2];
1450     c->edx = cpu->hyperv_interface_id[3];
1451 
1452     c = &cpuid_ent[cpuid_i++];
1453     c->function = HV_CPUID_VERSION;
1454     c->eax = cpu->hyperv_ver_id_build;
1455     c->ebx = (uint32_t)cpu->hyperv_ver_id_major << 16 |
1456         cpu->hyperv_ver_id_minor;
1457     c->ecx = cpu->hyperv_ver_id_sp;
1458     c->edx = (uint32_t)cpu->hyperv_ver_id_sb << 24 |
1459         (cpu->hyperv_ver_id_sn & 0xffffff);
1460 
1461     c = &cpuid_ent[cpuid_i++];
1462     c->function = HV_CPUID_FEATURES;
1463     c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EAX);
1464     c->ebx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EBX);
1465     c->edx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EDX);
1466 
1467     /* Unconditionally required with any Hyper-V enlightenment */
1468     c->eax |= HV_HYPERCALL_AVAILABLE;
1469 
1470     /* SynIC and Vmbus devices require messages/signals hypercalls */
1471     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1472         !cpu->hyperv_synic_kvm_only) {
1473         c->ebx |= HV_POST_MESSAGES | HV_SIGNAL_EVENTS;
1474     }
1475 
1476 
1477     /* Not exposed by KVM but needed to make CPU hotplug in Windows work */
1478     c->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1479 
1480     c = &cpuid_ent[cpuid_i++];
1481     c->function = HV_CPUID_ENLIGHTMENT_INFO;
1482     c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX);
1483     c->ebx = cpu->hyperv_spinlock_attempts;
1484 
1485     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) &&
1486         !hyperv_feat_enabled(cpu, HYPERV_FEAT_AVIC)) {
1487         c->eax |= HV_APIC_ACCESS_RECOMMENDED;
1488     }
1489 
1490     if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_ON) {
1491         c->eax |= HV_NO_NONARCH_CORESHARING;
1492     } else if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO) {
1493         c->eax |= hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX) &
1494             HV_NO_NONARCH_CORESHARING;
1495     }
1496 
1497     c = &cpuid_ent[cpuid_i++];
1498     c->function = HV_CPUID_IMPLEMENT_LIMITS;
1499     c->eax = cpu->hv_max_vps;
1500     c->ebx = cpu->hyperv_limits[0];
1501     c->ecx = cpu->hyperv_limits[1];
1502     c->edx = cpu->hyperv_limits[2];
1503 
1504     if (nested_eax) {
1505         uint32_t function;
1506 
1507         /* Create zeroed 0x40000006..0x40000009 leaves */
1508         for (function = HV_CPUID_IMPLEMENT_LIMITS + 1;
1509              function < HV_CPUID_NESTED_FEATURES; function++) {
1510             c = &cpuid_ent[cpuid_i++];
1511             c->function = function;
1512         }
1513 
1514         c = &cpuid_ent[cpuid_i++];
1515         c->function = HV_CPUID_NESTED_FEATURES;
1516         c->eax = nested_eax;
1517     }
1518 
1519     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) {
1520         c = &cpuid_ent[cpuid_i++];
1521         c->function = HV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS;
1522         c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ?
1523             HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
1524         memcpy(signature, "Microsoft VS", 12);
1525         c->eax = 0;
1526         c->ebx = signature[0];
1527         c->ecx = signature[1];
1528         c->edx = signature[2];
1529 
1530         c = &cpuid_ent[cpuid_i++];
1531         c->function = HV_CPUID_SYNDBG_INTERFACE;
1532         memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12);
1533         c->eax = signature[0];
1534         c->ebx = 0;
1535         c->ecx = 0;
1536         c->edx = 0;
1537 
1538         c = &cpuid_ent[cpuid_i++];
1539         c->function = HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES;
1540         c->eax = HV_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
1541         c->ebx = 0;
1542         c->ecx = 0;
1543         c->edx = 0;
1544     }
1545 
1546     return cpuid_i;
1547 }
1548 
1549 static Error *hv_passthrough_mig_blocker;
1550 static Error *hv_no_nonarch_cs_mig_blocker;
1551 
1552 /* Checks that the exposed eVMCS version range is supported by KVM */
1553 static bool evmcs_version_supported(uint16_t evmcs_version,
1554                                     uint16_t supported_evmcs_version)
1555 {
1556     uint8_t min_version = evmcs_version & 0xff;
1557     uint8_t max_version = evmcs_version >> 8;
1558     uint8_t min_supported_version = supported_evmcs_version & 0xff;
1559     uint8_t max_supported_version = supported_evmcs_version >> 8;
1560 
1561     return (min_version >= min_supported_version) &&
1562         (max_version <= max_supported_version);
1563 }
1564 
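/*
 * Per-vCPU Hyper-V setup: install migration blockers where needed, verify
 * VP_INDEX consistency, and enable the SynIC, eVMCS and enforce-cpuid
 * capabilities that were requested.
 */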
1565 static int hyperv_init_vcpu(X86CPU *cpu)
1566 {
1567     CPUState *cs = CPU(cpu);
1568     Error *local_err = NULL;
1569     int ret;
1570 
1571     if (cpu->hyperv_passthrough && hv_passthrough_mig_blocker == NULL) {
1572         error_setg(&hv_passthrough_mig_blocker,
1573                    "'hv-passthrough' CPU flag prevents migration, use explicit"
1574                    " set of hv-* flags instead");
1575         ret = migrate_add_blocker(hv_passthrough_mig_blocker, &local_err);
1576         if (ret < 0) {
1577             error_report_err(local_err);
1578             return ret;
1579         }
1580     }
1581 
1582     if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO &&
1583         hv_no_nonarch_cs_mig_blocker == NULL) {
1584         error_setg(&hv_no_nonarch_cs_mig_blocker,
1585                    "'hv-no-nonarch-coresharing=auto' CPU flag prevents migration"
1586                    " use explicit 'hv-no-nonarch-coresharing=on' instead (but"
1587                    " make sure SMT is disabled and/or that vCPUs are properly"
1588                    " pinned)");
1589         ret = migrate_add_blocker(hv_no_nonarch_cs_mig_blocker, &local_err);
1590         if (ret < 0) {
1591             error_report_err(local_err);
1592             return ret;
1593         }
1594     }
1595 
1596     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && !hv_vpindex_settable) {
1597         /*
1598          * the kernel doesn't support setting vp_index; assert that its value
1599          * is in sync
1600          */
1601         uint64_t value;
1602 
1603         ret = kvm_get_one_msr(cpu, HV_X64_MSR_VP_INDEX, &value);
1604         if (ret < 0) {
1605             return ret;
1606         }
1607 
1608         if (value != hyperv_vp_index(CPU(cpu))) {
1609             error_report("kernel's vp_index != QEMU's vp_index");
1610             return -ENXIO;
1611         }
1612     }
1613 
1614     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
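        /*
         * When SynIC is also used by QEMU's own Hyper-V devices (i.e. not
         * 'kvm-only'), KVM_CAP_HYPERV_SYNIC2 is requested so that KVM does
         * not clear the SynIC message/event pages behind QEMU's back.
         */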
1615         uint32_t synic_cap = cpu->hyperv_synic_kvm_only ?
1616             KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1617         ret = kvm_vcpu_enable_cap(cs, synic_cap, 0);
1618         if (ret < 0) {
1619             error_report("failed to turn on HyperV SynIC in KVM: %s",
1620                          strerror(-ret));
1621             return ret;
1622         }
1623 
1624         if (!cpu->hyperv_synic_kvm_only) {
1625             ret = hyperv_x86_synic_add(cpu);
1626             if (ret < 0) {
1627                 error_report("failed to create HyperV SynIC: %s",
1628                              strerror(-ret));
1629                 return ret;
1630             }
1631         }
1632     }
1633 
1634     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
1635         uint16_t evmcs_version = DEFAULT_EVMCS_VERSION;
1636         uint16_t supported_evmcs_version;
1637 
1638         ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
1639                                   (uintptr_t)&supported_evmcs_version);
1640 
1641         /*
1642          * KVM is required to support EVMCS ver.1, as that's what the
1643          * 'hv-evmcs' option sets. Note: we hardcode the maximum supported
1644          * eVMCS version to '1' as well, so the 'hv-evmcs' feature stays
1645          * migratable even when (and if) ver.2 is implemented. A new
1646          * option (e.g. 'hv-evmcs=2') will then have to be added.
1647          */
1648         if (ret < 0) {
1649             error_report("Hyper-V %s is not supported by kernel",
1650                          kvm_hyperv_properties[HYPERV_FEAT_EVMCS].desc);
1651             return ret;
1652         }
1653 
1654         if (!evmcs_version_supported(evmcs_version, supported_evmcs_version)) {
1655             error_report("eVMCS version range [%d..%d] is not supported by "
1656                          "kernel (supported: [%d..%d])", evmcs_version & 0xff,
1657                          evmcs_version >> 8, supported_evmcs_version & 0xff,
1658                          supported_evmcs_version >> 8);
1659             return -ENOTSUP;
1660         }
1661     }
1662 
1663     if (cpu->hyperv_enforce_cpuid) {
1664         ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENFORCE_CPUID, 0, 1);
1665         if (ret < 0) {
1666             error_report("failed to enable KVM_CAP_HYPERV_ENFORCE_CPUID: %s",
1667                          strerror(-ret));
1668             return ret;
1669         }
1670     }
1671 
1672     return 0;
1673 }
1674 
1675 static Error *invtsc_mig_blocker;
1676 
1677 #define KVM_MAX_CPUID_ENTRIES  100
1678 
1679 static void kvm_init_xsave(CPUX86State *env)
1680 {
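    /*
     * When KVM_CAP_XSAVE2 is available, has_xsave2 holds the buffer size
     * that KVM_CHECK_EXTENSION reported for KVM_GET_XSAVE2; round it up to
     * a whole page for the qemu_memalign() allocation below.
     */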
1681     if (has_xsave2) {
1682         env->xsave_buf_len = QEMU_ALIGN_UP(has_xsave2, 4096);
1683     } else if (has_xsave) {
1684         env->xsave_buf_len = sizeof(struct kvm_xsave);
1685     } else {
1686         return;
1687     }
1688 
1689     env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len);
1690     memset(env->xsave_buf, 0, env->xsave_buf_len);
1691     /*
1692      * The allocated storage must be large enough for all of the
1693      * possible XSAVE state components.
1694      */
1695     assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX) <=
1696            env->xsave_buf_len);
1697 }
1698 
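/*
 * Reset the nested virtualization state to "not in guest mode": for VMX the
 * VMXON and current-VMCS addresses are set to -1 (the "invalid" value KVM
 * expects), for SVM only the format field needs to be set.
 */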
1699 static void kvm_init_nested_state(CPUX86State *env)
1700 {
1701     struct kvm_vmx_nested_state_hdr *vmx_hdr;
1702     uint32_t size;
1703 
1704     if (!env->nested_state) {
1705         return;
1706     }
1707 
1708     size = env->nested_state->size;
1709 
1710     memset(env->nested_state, 0, size);
1711     env->nested_state->size = size;
1712 
1713     if (cpu_has_vmx(env)) {
1714         env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
1715         vmx_hdr = &env->nested_state->hdr.vmx;
1716         vmx_hdr->vmxon_pa = -1ull;
1717         vmx_hdr->vmcs12_pa = -1ull;
1718     } else if (cpu_has_svm(env)) {
1719         env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
1720     }
1721 }
1722 
1723 int kvm_arch_init_vcpu(CPUState *cs)
1724 {
1725     struct {
1726         struct kvm_cpuid2 cpuid;
1727         struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
1728     } cpuid_data;
1729     /*
1730      * The kernel defines these structs with padding fields so there
1731      * should be no extra padding in our cpuid_data struct.
1732      */
1733     QEMU_BUILD_BUG_ON(sizeof(cpuid_data) !=
1734                       sizeof(struct kvm_cpuid2) +
1735                       sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
1736 
1737     X86CPU *cpu = X86_CPU(cs);
1738     CPUX86State *env = &cpu->env;
1739     uint32_t limit, i, j, cpuid_i;
1740     uint32_t unused;
1741     struct kvm_cpuid_entry2 *c;
1742     uint32_t signature[3];
1743     int kvm_base = KVM_CPUID_SIGNATURE;
1744     int max_nested_state_len;
1745     int r;
1746     Error *local_err = NULL;
1747 
1748     memset(&cpuid_data, 0, sizeof(cpuid_data));
1749 
1750     cpuid_i = 0;
1751 
1752     has_xsave2 = kvm_check_extension(cs->kvm_state, KVM_CAP_XSAVE2);
1753 
1754     r = kvm_arch_set_tsc_khz(cs);
1755     if (r < 0) {
1756         return r;
1757     }
1758 
1759     /* The vCPU's TSC frequency is either specified by the user or follows
1760      * the value used by KVM if the former is not present. In the latter
1761      * case, we query it from KVM and record it in env->tsc_khz, so that
1762      * the vCPU's TSC frequency can be migrated later via this field.
1763      */
1764     if (!env->tsc_khz) {
1765         r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
1766             kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
1767             -ENOTSUP;
1768         if (r > 0) {
1769             env->tsc_khz = r;
1770         }
1771     }
1772 
1773     env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
1774 
1775     /*
1776      * kvm_hyperv_expand_features() is called here for the second time in case
1777      * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle
1778      * 'query-cpu-model-expansion' in this case as we don't have a KVM vCPU to
1779      * check which Hyper-V enlightenments are supported and which are not, we
1780      * can still proceed and check/expand Hyper-V enlightenments here so legacy
1781      * behavior is preserved.
1782      */
1783     if (!kvm_hyperv_expand_features(cpu, &local_err)) {
1784         error_report_err(local_err);
1785         return -ENOSYS;
1786     }
1787 
1788     if (hyperv_enabled(cpu)) {
1789         r = hyperv_init_vcpu(cpu);
1790         if (r) {
1791             return r;
1792         }
1793 
1794         cpuid_i = hyperv_fill_cpuids(cs, cpuid_data.entries);
1795         kvm_base = KVM_CPUID_SIGNATURE_NEXT;
1796         has_msr_hv_hypercall = true;
1797     }
1798 
1799     if (cpu->expose_kvm) {
1800         memcpy(signature, "KVMKVMKVM\0\0\0", 12);
1801         c = &cpuid_data.entries[cpuid_i++];
1802         c->function = KVM_CPUID_SIGNATURE | kvm_base;
1803         c->eax = KVM_CPUID_FEATURES | kvm_base;
1804         c->ebx = signature[0];
1805         c->ecx = signature[1];
1806         c->edx = signature[2];
1807 
1808         c = &cpuid_data.entries[cpuid_i++];
1809         c->function = KVM_CPUID_FEATURES | kvm_base;
1810         c->eax = env->features[FEAT_KVM];
1811         c->edx = env->features[FEAT_KVM_HINTS];
1812     }
1813 
1814     cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
1815 
1816     if (cpu->kvm_pv_enforce_cpuid) {
1817         r = kvm_vcpu_enable_cap(cs, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 0, 1);
1818         if (r < 0) {
1819             fprintf(stderr,
1820                     "failed to enable KVM_CAP_ENFORCE_PV_FEATURE_CPUID: %s",
1821                     strerror(-r));
1822             abort();
1823         }
1824     }
1825 
1826     for (i = 0; i <= limit; i++) {
1827         if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1828             fprintf(stderr, "unsupported level value: 0x%x\n", limit);
1829             abort();
1830         }
1831         c = &cpuid_data.entries[cpuid_i++];
1832 
1833         switch (i) {
1834         case 2: {
1835             /* Keep reading function 2 till all the input is received */
1836             int times;
1837 
1838             c->function = i;
1839             c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
1840                        KVM_CPUID_FLAG_STATE_READ_NEXT;
1841             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1842             times = c->eax & 0xff;
1843 
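            /*
             * The low byte of leaf 2's EAX says how many times CPUID(2)
             * has to be executed to obtain all descriptors; on modern CPUs
             * this is 1, but replay the historical protocol regardless.
             */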
1844             for (j = 1; j < times; ++j) {
1845                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1846                     fprintf(stderr, "cpuid_data is full, no space for "
1847                             "cpuid(eax:2):eax & 0xff = 0x%x\n", times);
1848                     abort();
1849                 }
1850                 c = &cpuid_data.entries[cpuid_i++];
1851                 c->function = i;
1852                 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
1853                 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1854             }
1855             break;
1856         }
1857         case 0x1f:
1858             if (env->nr_dies < 2) {
1859                 break;
1860             }
1861             /* fallthrough */
1862         case 4:
1863         case 0xb:
1864         case 0xd:
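            /*
             * These leaves have ECX-indexed sub-leaves.  Enumerate them until
             * the per-leaf termination condition below is met, with a hard
             * cap of 64 sub-leaves for leaves 0xd and 0x1f.
             */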
1865             for (j = 0; ; j++) {
1866                 if (i == 0xd && j == 64) {
1867                     break;
1868                 }
1869 
1870                 if (i == 0x1f && j == 64) {
1871                     break;
1872                 }
1873 
1874                 c->function = i;
1875                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1876                 c->index = j;
1877                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1878 
1879                 if (i == 4 && c->eax == 0) {
1880                     break;
1881                 }
1882                 if (i == 0xb && !(c->ecx & 0xff00)) {
1883                     break;
1884                 }
1885                 if (i == 0x1f && !(c->ecx & 0xff00)) {
1886                     break;
1887                 }
1888                 if (i == 0xd && c->eax == 0) {
1889                     continue;
1890                 }
1891                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1892                     fprintf(stderr, "cpuid_data is full, no space for "
1893                             "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1894                     abort();
1895                 }
1896                 c = &cpuid_data.entries[cpuid_i++];
1897             }
1898             break;
1899         case 0x7:
1900         case 0x12:
1901             for (j = 0; ; j++) {
1902                 c->function = i;
1903                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1904                 c->index = j;
1905                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1906 
1907                 if (j > 1 && (c->eax & 0xf) != 1) {
1908                     break;
1909                 }
1910 
1911                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1912                     fprintf(stderr, "cpuid_data is full, no space for "
1913                                 "cpuid(eax:0x12,ecx:0x%x)\n", j);
1914                     abort();
1915                 }
1916                 c = &cpuid_data.entries[cpuid_i++];
1917             }
1918             break;
1919         case 0x14:
1920         case 0x1d:
1921         case 0x1e: {
1922             uint32_t times;
1923 
1924             c->function = i;
1925             c->index = 0;
1926             c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1927             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1928             times = c->eax;
1929 
1930             for (j = 1; j <= times; ++j) {
1931                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1932                     fprintf(stderr, "cpuid_data is full, no space for "
1933                                 "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1934                     abort();
1935                 }
1936                 c = &cpuid_data.entries[cpuid_i++];
1937                 c->function = i;
1938                 c->index = j;
1939                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1940                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1941             }
1942             break;
1943         }
1944         default:
1945             c->function = i;
1946             c->flags = 0;
1947             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1948             if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
1949                 /*
1950                  * KVM already returns all zeroes if a CPUID entry is missing,
1951                  * so we can omit it and avoid hitting KVM's 80-entry limit.
1952                  */
1953                 cpuid_i--;
1954             }
1955             break;
1956         }
1957     }
1958 
1959     if (limit >= 0x0a) {
1960         uint32_t eax, edx;
1961 
1962         cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx);
1963 
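        /*
         * CPUID leaf 0xA: EAX[7:0] is the architectural PMU version,
         * EAX[15:8] the number of general-purpose counters, and EDX[4:0]
         * the number of fixed-function counters (PMU version >= 2).
         */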
1964         has_architectural_pmu_version = eax & 0xff;
1965         if (has_architectural_pmu_version > 0) {
1966             num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8;
1967 
1968             /* Shouldn't be more than 32, since that's the number of bits
1969              * available in EBX to tell us _which_ counters are available.
1970              * Play it safe.
1971              */
1972             if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) {
1973                 num_architectural_pmu_gp_counters = MAX_GP_COUNTERS;
1974             }
1975 
1976             if (has_architectural_pmu_version > 1) {
1977                 num_architectural_pmu_fixed_counters = edx & 0x1f;
1978 
1979                 if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) {
1980                     num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS;
1981                 }
1982             }
1983         }
1984     }
1985 
1986     cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);
1987 
1988     for (i = 0x80000000; i <= limit; i++) {
1989         if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1990             fprintf(stderr, "unsupported xlevel value: 0x%x\n", limit);
1991             abort();
1992         }
1993         c = &cpuid_data.entries[cpuid_i++];
1994 
1995         switch (i) {
1996         case 0x8000001d:
1997             /* Query for all AMD cache information leaves */
1998             for (j = 0; ; j++) {
1999                 c->function = i;
2000                 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2001                 c->index = j;
2002                 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
2003 
2004                 if (c->eax == 0) {
2005                     break;
2006                 }
2007                 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
2008                     fprintf(stderr, "cpuid_data is full, no space for "
2009                             "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
2010                     abort();
2011                 }
2012                 c = &cpuid_data.entries[cpuid_i++];
2013             }
2014             break;
2015         default:
2016             c->function = i;
2017             c->flags = 0;
2018             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
2019             if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
2020                 /*
2021                  * KVM already returns all zeroes if a CPUID entry is missing,
2022                  * so we can omit it and avoid hitting KVM's 80-entry limit.
2023                  */
2024                 cpuid_i--;
2025             }
2026             break;
2027         }
2028     }
2029 
2030     /* Call Centaur's CPUID instructions if they are supported. */
2031     if (env->cpuid_xlevel2 > 0) {
2032         cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);
2033 
2034         for (i = 0xC0000000; i <= limit; i++) {
2035             if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
2036                 fprintf(stderr, "unsupported xlevel2 value: 0x%x\n", limit);
2037                 abort();
2038             }
2039             c = &cpuid_data.entries[cpuid_i++];
2040 
2041             c->function = i;
2042             c->flags = 0;
2043             cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
2044         }
2045     }
2046 
2047     cpuid_data.cpuid.nent = cpuid_i;
2048 
2049     if (((env->cpuid_version >> 8) & 0xF) >= 6
2050         && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
2051            (CPUID_MCE | CPUID_MCA)
2052         && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
2053         uint64_t mcg_cap, unsupported_caps;
2054         int banks;
2055         int ret;
2056 
2057         ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
2058         if (ret < 0) {
2059             fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
2060             return ret;
2061         }
2062 
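        /* MCG_CAP[7:0] (MCG_CAP_BANKS_MASK) is the supported MCE bank count. */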
2063         if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) {
2064             error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)",
2065                          (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks);
2066             return -ENOTSUP;
2067         }
2068 
2069         unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK);
2070         if (unsupported_caps) {
2071             if (unsupported_caps & MCG_LMCE_P) {
2072                 error_report("kvm: LMCE not supported");
2073                 return -ENOTSUP;
2074             }
2075             warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64,
2076                         unsupported_caps);
2077         }
2078 
2079         env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK;
2080         ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap);
2081         if (ret < 0) {
2082             fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
2083             return ret;
2084         }
2085     }
2086 
2087     cpu->vmsentry = qemu_add_vm_change_state_handler(cpu_update_state, env);
2088 
2089     c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
2090     if (c) {
2091         has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
2092                                   !!(c->ecx & CPUID_EXT_SMX);
2093     }
2094 
2095     c = cpuid_find_entry(&cpuid_data.cpuid, 7, 0);
2096     if (c && (c->ebx & CPUID_7_0_EBX_SGX)) {
2097         has_msr_feature_control = true;
2098     }
2099 
2100     if (env->mcg_cap & MCG_LMCE_P) {
2101         has_msr_mcg_ext_ctl = has_msr_feature_control = true;
2102     }
2103 
2104     if (!env->user_tsc_khz) {
2105         if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
2106             invtsc_mig_blocker == NULL) {
2107             error_setg(&invtsc_mig_blocker,
2108                        "State blocked by non-migratable CPU device"
2109                        " (invtsc flag)");
2110             r = migrate_add_blocker(invtsc_mig_blocker, &local_err);
2111             if (r < 0) {
2112                 error_report_err(local_err);
2113                 return r;
2114             }
2115         }
2116     }
2117 
2118     if (cpu->vmware_cpuid_freq
2119         /* Guests depend on 0x40000000 to detect this feature, so only expose
2120          * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */
2121         && cpu->expose_kvm
2122         && kvm_base == KVM_CPUID_SIGNATURE
2123         /* TSC clock must be stable and known for this feature. */
2124         && tsc_is_stable_and_known(env)) {
2125 
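        /*
         * VMware-compatible timing leaf 0x40000010: EAX reports the TSC
         * frequency and EBX the (virtual) APIC bus frequency, both in kHz.
         */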
2126         c = &cpuid_data.entries[cpuid_i++];
2127         c->function = KVM_CPUID_SIGNATURE | 0x10;
2128         c->eax = env->tsc_khz;
2129         c->ebx = env->apic_bus_freq / 1000; /* Hz to KHz */
2130         c->ecx = c->edx = 0;
2131 
2132         c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0);
2133         c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10);
2134     }
2135 
2136     cpuid_data.cpuid.nent = cpuid_i;
2137 
2138     cpuid_data.cpuid.padding = 0;
2139     r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
2140     if (r) {
2141         goto fail;
2142     }
2143     kvm_init_xsave(env);
2144 
2145     max_nested_state_len = kvm_max_nested_state_length();
2146     if (max_nested_state_len > 0) {
2147         assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
2148 
2149         if (cpu_has_vmx(env) || cpu_has_svm(env)) {
2150             env->nested_state = g_malloc0(max_nested_state_len);
2151             env->nested_state->size = max_nested_state_len;
2152 
2153             kvm_init_nested_state(env);
2154         }
2155     }
2156 
2157     cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
2158 
2159     if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
2160         has_msr_tsc_aux = false;
2161     }
2162 
2163     kvm_init_msrs(cpu);
2164 
2165     return 0;
2166 
2167  fail:
2168     migrate_del_blocker(invtsc_mig_blocker);
2169 
2170     return r;
2171 }
2172 
2173 int kvm_arch_destroy_vcpu(CPUState *cs)
2174 {
2175     X86CPU *cpu = X86_CPU(cs);
2176     CPUX86State *env = &cpu->env;
2177 
2178     g_free(env->xsave_buf);
2179 
2180     g_free(cpu->kvm_msr_buf);
2181     cpu->kvm_msr_buf = NULL;
2182 
2183     g_free(env->nested_state);
2184     env->nested_state = NULL;
2185 
2186     qemu_del_vm_change_state_handler(cpu->vmsentry);
2187 
2188     return 0;
2189 }
2190 
2191 void kvm_arch_reset_vcpu(X86CPU *cpu)
2192 {
2193     CPUX86State *env = &cpu->env;
2194 
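    /* After reset only the x87 bit (bit 0) of XCR0 is architecturally set. */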
2195     env->xcr0 = 1;
2196     if (kvm_irqchip_in_kernel()) {
2197         env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
2198                                           KVM_MP_STATE_UNINITIALIZED;
2199     } else {
2200         env->mp_state = KVM_MP_STATE_RUNNABLE;
2201     }
2202 
2203     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
2204         int i;
2205         for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) {
2206             env->msr_hv_synic_sint[i] = HV_SINT_MASKED;
2207         }
2208 
2209         hyperv_x86_synic_reset(cpu);
2210     }
2211     /* enabled by default */
2212     env->poll_control_msr = 1;
2213 
2214     kvm_init_nested_state(env);
2215 
2216     sev_es_set_reset_vector(CPU(cpu));
2217 }
2218 
2219 void kvm_arch_do_init_vcpu(X86CPU *cpu)
2220 {
2221     CPUX86State *env = &cpu->env;
2222 
2223     /* APs get directly into wait-for-SIPI state.  */
2224     if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) {
2225         env->mp_state = KVM_MP_STATE_INIT_RECEIVED;
2226     }
2227 }
2228 
2229 static int kvm_get_supported_feature_msrs(KVMState *s)
2230 {
2231     int ret = 0;
2232 
2233     if (kvm_feature_msrs != NULL) {
2234         return 0;
2235     }
2236 
2237     if (!kvm_check_extension(s, KVM_CAP_GET_MSR_FEATURES)) {
2238         return 0;
2239     }
2240 
2241     struct kvm_msr_list msr_list;
2242 
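    /*
     * First probe with nmsrs = 0: the ioctl is expected to fail with -E2BIG
     * while filling in the number of feature MSRs, which is then used to
     * size the real query below.
     */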
2243     msr_list.nmsrs = 0;
2244     ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, &msr_list);
2245     if (ret < 0 && ret != -E2BIG) {
2246         error_report("Fetch KVM feature MSR list failed: %s",
2247             strerror(-ret));
2248         return ret;
2249     }
2250 
2251     assert(msr_list.nmsrs > 0);
2252     kvm_feature_msrs = (struct kvm_msr_list *) \
2253         g_malloc0(sizeof(msr_list) +
2254                  msr_list.nmsrs * sizeof(msr_list.indices[0]));
2255 
2256     kvm_feature_msrs->nmsrs = msr_list.nmsrs;
2257     ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, kvm_feature_msrs);
2258 
2259     if (ret < 0) {
2260         error_report("Fetch KVM feature MSR list failed: %s",
2261             strerror(-ret));
2262         g_free(kvm_feature_msrs);
2263         kvm_feature_msrs = NULL;
2264         return ret;
2265     }
2266 
2267     return 0;
2268 }
2269 
2270 static int kvm_get_supported_msrs(KVMState *s)
2271 {
2272     int ret = 0;
2273     struct kvm_msr_list msr_list, *kvm_msr_list;
2274 
2275     /*
2276      *  Obtain MSR list from KVM.  These are the MSRs that we must
2277      *  save/restore.
2278      */
2279     msr_list.nmsrs = 0;
2280     ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
2281     if (ret < 0 && ret != -E2BIG) {
2282         return ret;
2283     }
2284     /*
2285      * Old kernel modules had a bug and could write beyond the provided
2286      * memory. Allocate at least 1K to be safe.
2287      */
2288     kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
2289                                           msr_list.nmsrs *
2290                                           sizeof(msr_list.indices[0])));
2291 
2292     kvm_msr_list->nmsrs = msr_list.nmsrs;
2293     ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
2294     if (ret >= 0) {
2295         int i;
2296 
2297         for (i = 0; i < kvm_msr_list->nmsrs; i++) {
2298             switch (kvm_msr_list->indices[i]) {
2299             case MSR_STAR:
2300                 has_msr_star = true;
2301                 break;
2302             case MSR_VM_HSAVE_PA:
2303                 has_msr_hsave_pa = true;
2304                 break;
2305             case MSR_TSC_AUX:
2306                 has_msr_tsc_aux = true;
2307                 break;
2308             case MSR_TSC_ADJUST:
2309                 has_msr_tsc_adjust = true;
2310                 break;
2311             case MSR_IA32_TSCDEADLINE:
2312                 has_msr_tsc_deadline = true;
2313                 break;
2314             case MSR_IA32_SMBASE:
2315                 has_msr_smbase = true;
2316                 break;
2317             case MSR_SMI_COUNT:
2318                 has_msr_smi_count = true;
2319                 break;
2320             case MSR_IA32_MISC_ENABLE:
2321                 has_msr_misc_enable = true;
2322                 break;
2323             case MSR_IA32_BNDCFGS:
2324                 has_msr_bndcfgs = true;
2325                 break;
2326             case MSR_IA32_XSS:
2327                 has_msr_xss = true;
2328                 break;
2329             case MSR_IA32_UMWAIT_CONTROL:
2330                 has_msr_umwait = true;
2331                 break;
2332             case HV_X64_MSR_CRASH_CTL:
2333                 has_msr_hv_crash = true;
2334                 break;
2335             case HV_X64_MSR_RESET:
2336                 has_msr_hv_reset = true;
2337                 break;
2338             case HV_X64_MSR_VP_INDEX:
2339                 has_msr_hv_vpindex = true;
2340                 break;
2341             case HV_X64_MSR_VP_RUNTIME:
2342                 has_msr_hv_runtime = true;
2343                 break;
2344             case HV_X64_MSR_SCONTROL:
2345                 has_msr_hv_synic = true;
2346                 break;
2347             case HV_X64_MSR_STIMER0_CONFIG:
2348                 has_msr_hv_stimer = true;
2349                 break;
2350             case HV_X64_MSR_TSC_FREQUENCY:
2351                 has_msr_hv_frequencies = true;
2352                 break;
2353             case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2354                 has_msr_hv_reenlightenment = true;
2355                 break;
2356             case HV_X64_MSR_SYNDBG_OPTIONS:
2357                 has_msr_hv_syndbg_options = true;
2358                 break;
2359             case MSR_IA32_SPEC_CTRL:
2360                 has_msr_spec_ctrl = true;
2361                 break;
2362             case MSR_AMD64_TSC_RATIO:
2363                 has_tsc_scale_msr = true;
2364                 break;
2365             case MSR_IA32_TSX_CTRL:
2366                 has_msr_tsx_ctrl = true;
2367                 break;
2368             case MSR_VIRT_SSBD:
2369                 has_msr_virt_ssbd = true;
2370                 break;
2371             case MSR_IA32_ARCH_CAPABILITIES:
2372                 has_msr_arch_capabs = true;
2373                 break;
2374             case MSR_IA32_CORE_CAPABILITY:
2375                 has_msr_core_capabs = true;
2376                 break;
2377             case MSR_IA32_PERF_CAPABILITIES:
2378                 has_msr_perf_capabs = true;
2379                 break;
2380             case MSR_IA32_VMX_VMFUNC:
2381                 has_msr_vmx_vmfunc = true;
2382                 break;
2383             case MSR_IA32_UCODE_REV:
2384                 has_msr_ucode_rev = true;
2385                 break;
2386             case MSR_IA32_VMX_PROCBASED_CTLS2:
2387                 has_msr_vmx_procbased_ctls2 = true;
2388                 break;
2389             case MSR_IA32_PKRS:
2390                 has_msr_pkrs = true;
2391                 break;
2392             }
2393         }
2394     }
2395 
2396     g_free(kvm_msr_list);
2397 
2398     return ret;
2399 }
2400 
2401 static Notifier smram_machine_done;
2402 static KVMMemoryListener smram_listener;
2403 static AddressSpace smram_address_space;
2404 static MemoryRegion smram_as_root;
2405 static MemoryRegion smram_as_mem;
2406 
2407 static void register_smram_listener(Notifier *n, void *unused)
2408 {
2409     MemoryRegion *smram =
2410         (MemoryRegion *) object_resolve_path("/machine/smram", NULL);
2411 
2412     /* Outer container... */
2413     memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull);
2414     memory_region_set_enabled(&smram_as_root, true);
2415 
2416     /* ... with two regions inside: normal system memory with low
2417      * priority, and...
2418      */
2419     memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram",
2420                              get_system_memory(), 0, ~0ull);
2421     memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0);
2422     memory_region_set_enabled(&smram_as_mem, true);
2423 
2424     if (smram) {
2425         /* ... SMRAM with higher priority */
2426         memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10);
2427         memory_region_set_enabled(smram, true);
2428     }
2429 
2430     address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM");
2431     kvm_memory_listener_register(kvm_state, &smram_listener,
2432                                  &smram_address_space, 1, "kvm-smram");
2433 }
2434 
2435 int kvm_arch_init(MachineState *ms, KVMState *s)
2436 {
2437     uint64_t identity_base = 0xfffbc000;
2438     uint64_t shadow_mem;
2439     int ret;
2440     struct utsname utsname;
2441     Error *local_err = NULL;
2442 
2443     /*
2444      * Initialize SEV context, if required
2445      *
2446      * If no memory encryption is requested (ms->cgs == NULL) this is
2447      * a no-op.
2448      *
2449      * It's also a no-op if a non-SEV confidential guest support
2450      * mechanism is selected.  SEV is the only mechanism available to
2451      * select on x86 at present, so this doesn't arise, but if new
2452      * mechanisms are supported in future (e.g. TDX), they'll need
2453      * their own initialization either here or elsewhere.
2454      */
2455     ret = sev_kvm_init(ms->cgs, &local_err);
2456     if (ret < 0) {
2457         error_report_err(local_err);
2458         return ret;
2459     }
2460 
2461     if (!kvm_check_extension(s, KVM_CAP_IRQ_ROUTING)) {
2462         error_report("kvm: KVM_CAP_IRQ_ROUTING not supported by KVM");
2463         return -ENOTSUP;
2464     }
2465 
2466     has_xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
2467     has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
2468     has_pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
2469     has_sregs2 = kvm_check_extension(s, KVM_CAP_SREGS2) > 0;
2470 
2471     hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
2472 
2473     has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
2474     if (has_exception_payload) {
2475         ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
2476         if (ret < 0) {
2477             error_report("kvm: Failed to enable exception payload cap: %s",
2478                          strerror(-ret));
2479             return ret;
2480         }
2481     }
2482 
2483     has_triple_fault_event = kvm_check_extension(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT);
2484     if (has_triple_fault_event) {
2485         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 0, true);
2486         if (ret < 0) {
2487             error_report("kvm: Failed to enable triple fault event cap: %s",
2488                          strerror(-ret));
2489             return ret;
2490         }
2491     }
2492 
2493     ret = kvm_get_supported_msrs(s);
2494     if (ret < 0) {
2495         return ret;
2496     }
2497 
2498     kvm_get_supported_feature_msrs(s);
2499 
2500     uname(&utsname);
2501     lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
2502 
2503     /*
2504      * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
2505      * In order to use vm86 mode, an EPT identity map and a TSS are needed.
2506      * Since these must be part of guest physical memory, we need to allocate
2507      * them, both by setting their start addresses in the kernel and by
2508      * creating a corresponding e820 entry. We need 4 pages before the BIOS.
2509      *
2510      * Older KVM versions may not support setting the identity map base. In
2511      * that case we need to stick with the default, i.e. a 256K maximum BIOS
2512      * size.
2513      */
2514     if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
2515         /* Allows up to 16M BIOSes. */
2516         identity_base = 0xfeffc000;
2517 
2518         ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
2519         if (ret < 0) {
2520             return ret;
2521         }
2522     }
2523 
2524     /* Set TSS base one page after EPT identity map. */
2525     ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
2526     if (ret < 0) {
2527         return ret;
2528     }
2529 
2530     /* Tell fw_cfg to notify the BIOS to reserve the range. */
2531     ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
2532     if (ret < 0) {
2533         fprintf(stderr, "e820_add_entry() table is full\n");
2534         return ret;
2535     }
2536 
2537     shadow_mem = object_property_get_int(OBJECT(s), "kvm-shadow-mem", &error_abort);
2538     if (shadow_mem != -1) {
2539         shadow_mem /= 4096;
2540         ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
2541         if (ret < 0) {
2542             return ret;
2543         }
2544     }
2545 
2546     if (kvm_check_extension(s, KVM_CAP_X86_SMM) &&
2547         object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) &&
2548         x86_machine_is_smm_enabled(X86_MACHINE(ms))) {
2549         smram_machine_done.notify = register_smram_listener;
2550         qemu_add_machine_init_done_notifier(&smram_machine_done);
2551     }
2552 
2553     if (enable_cpu_pm) {
2554         int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
2555         int ret;
2556 
2557 /* Workaround for a kernel header with a typo. TODO: fix header and drop. */
2558 #if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
2559 #define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
2560 #endif
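        /*
         * Disabling these exits lets the guest execute HLT/MWAIT/PAUSE (and
         * enter C-states) without trapping to KVM, trading host CPU time for
         * lower guest wakeup latency.
         */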
2561         if (disable_exits) {
2562             disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
2563                               KVM_X86_DISABLE_EXITS_HLT |
2564                               KVM_X86_DISABLE_EXITS_PAUSE |
2565                               KVM_X86_DISABLE_EXITS_CSTATE);
2566         }
2567 
2568         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
2569                                 disable_exits);
2570         if (ret < 0) {
2571             error_report("kvm: guest stopping CPU not supported: %s",
2572                          strerror(-ret));
2573         }
2574     }
2575 
2576     if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) {
2577         X86MachineState *x86ms = X86_MACHINE(ms);
2578 
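        /*
         * Throttle guest-triggered bus locks: KVM exits to userspace on each
         * one and QEMU rate-limits the offending vCPU.
         */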
2579         if (x86ms->bus_lock_ratelimit > 0) {
2580             ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT);
2581             if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) {
2582                 error_report("kvm: bus lock detection unsupported");
2583                 return -ENOTSUP;
2584             }
2585             ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0,
2586                                     KVM_BUS_LOCK_DETECTION_EXIT);
2587             if (ret < 0) {
2588                 error_report("kvm: Failed to enable bus lock detection cap: %s",
2589                              strerror(-ret));
2590                 return ret;
2591             }
2592             ratelimit_init(&bus_lock_ratelimit_ctrl);
2593             ratelimit_set_speed(&bus_lock_ratelimit_ctrl,
2594                                 x86ms->bus_lock_ratelimit, BUS_LOCK_SLICE_TIME);
2595         }
2596     }
2597 
2598     return 0;
2599 }
2600 
2601 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2602 {
2603     lhs->selector = rhs->selector;
2604     lhs->base = rhs->base;
2605     lhs->limit = rhs->limit;
2606     lhs->type = 3;
2607     lhs->present = 1;
2608     lhs->dpl = 3;
2609     lhs->db = 0;
2610     lhs->s = 1;
2611     lhs->l = 0;
2612     lhs->g = 0;
2613     lhs->avl = 0;
2614     lhs->unusable = 0;
2615 }
2616 
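/*
 * Translate a QEMU SegmentCache (with descriptor-style packed flags) into
 * the unpacked kvm_segment layout expected by the KVM sregs ioctls.
 */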
2617 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2618 {
2619     unsigned flags = rhs->flags;
2620     lhs->selector = rhs->selector;
2621     lhs->base = rhs->base;
2622     lhs->limit = rhs->limit;
2623     lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
2624     lhs->present = (flags & DESC_P_MASK) != 0;
2625     lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
2626     lhs->db = (flags >> DESC_B_SHIFT) & 1;
2627     lhs->s = (flags & DESC_S_MASK) != 0;
2628     lhs->l = (flags >> DESC_L_SHIFT) & 1;
2629     lhs->g = (flags & DESC_G_MASK) != 0;
2630     lhs->avl = (flags & DESC_AVL_MASK) != 0;
2631     lhs->unusable = !lhs->present;
2632     lhs->padding = 0;
2633 }
2634 
2635 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
2636 {
2637     lhs->selector = rhs->selector;
2638     lhs->base = rhs->base;
2639     lhs->limit = rhs->limit;
2640     lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
2641                  ((rhs->present && !rhs->unusable) * DESC_P_MASK) |
2642                  (rhs->dpl << DESC_DPL_SHIFT) |
2643                  (rhs->db << DESC_B_SHIFT) |
2644                  (rhs->s * DESC_S_MASK) |
2645                  (rhs->l << DESC_L_SHIFT) |
2646                  (rhs->g * DESC_G_MASK) |
2647                  (rhs->avl * DESC_AVL_MASK);
2648 }
2649 
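/*
 * Copy a single register between the KVM and QEMU representations; 'set'
 * selects the direction (QEMU -> KVM when non-zero, KVM -> QEMU otherwise).
 */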
2650 static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
2651 {
2652     if (set) {
2653         *kvm_reg = *qemu_reg;
2654     } else {
2655         *qemu_reg = *kvm_reg;
2656     }
2657 }
2658 
2659 static int kvm_getput_regs(X86CPU *cpu, int set)
2660 {
2661     CPUX86State *env = &cpu->env;
2662     struct kvm_regs regs;
2663     int ret = 0;
2664 
2665     if (!set) {
2666         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
2667         if (ret < 0) {
2668             return ret;
2669         }
2670     }
2671 
2672     kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
2673     kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
2674     kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
2675     kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
2676     kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
2677     kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
2678     kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
2679     kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
2680 #ifdef TARGET_X86_64
2681     kvm_getput_reg(&regs.r8, &env->regs[8], set);
2682     kvm_getput_reg(&regs.r9, &env->regs[9], set);
2683     kvm_getput_reg(&regs.r10, &env->regs[10], set);
2684     kvm_getput_reg(&regs.r11, &env->regs[11], set);
2685     kvm_getput_reg(&regs.r12, &env->regs[12], set);
2686     kvm_getput_reg(&regs.r13, &env->regs[13], set);
2687     kvm_getput_reg(&regs.r14, &env->regs[14], set);
2688     kvm_getput_reg(&regs.r15, &env->regs[15], set);
2689 #endif
2690 
2691     kvm_getput_reg(&regs.rflags, &env->eflags, set);
2692     kvm_getput_reg(&regs.rip, &env->eip, set);
2693 
2694     if (set) {
2695         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
2696     }
2697 
2698     return ret;
2699 }
2700 
2701 static int kvm_put_fpu(X86CPU *cpu)
2702 {
2703     CPUX86State *env = &cpu->env;
2704     struct kvm_fpu fpu;
2705     int i;
2706 
2707     memset(&fpu, 0, sizeof fpu);
2708     fpu.fsw = env->fpus & ~(7 << 11);
2709     fpu.fsw |= (env->fpstt & 7) << 11;
2710     fpu.fcw = env->fpuc;
2711     fpu.last_opcode = env->fpop;
2712     fpu.last_ip = env->fpip;
2713     fpu.last_dp = env->fpdp;
2714     for (i = 0; i < 8; ++i) {
2715         fpu.ftwx |= (!env->fptags[i]) << i;
2716     }
2717     memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
2718     for (i = 0; i < CPU_NB_REGS; i++) {
2719         stq_p(&fpu.xmm[i][0], env->xmm_regs[i].ZMM_Q(0));
2720         stq_p(&fpu.xmm[i][8], env->xmm_regs[i].ZMM_Q(1));
2721     }
2722     fpu.mxcsr = env->mxcsr;
2723 
2724     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu);
2725 }
2726 
2727 static int kvm_put_xsave(X86CPU *cpu)
2728 {
2729     CPUX86State *env = &cpu->env;
2730     void *xsave = env->xsave_buf;
2731 
2732     if (!has_xsave) {
2733         return kvm_put_fpu(cpu);
2734     }
2735     x86_cpu_xsave_all_areas(cpu, xsave, env->xsave_buf_len);
2736 
2737     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
2738 }
2739 
2740 static int kvm_put_xcrs(X86CPU *cpu)
2741 {
2742     CPUX86State *env = &cpu->env;
2743     struct kvm_xcrs xcrs = {};
2744 
2745     if (!has_xcrs) {
2746         return 0;
2747     }
2748 
2749     xcrs.nr_xcrs = 1;
2750     xcrs.flags = 0;
2751     xcrs.xcrs[0].xcr = 0;
2752     xcrs.xcrs[0].value = env->xcr0;
2753     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
2754 }
2755 
2756 static int kvm_put_sregs(X86CPU *cpu)
2757 {
2758     CPUX86State *env = &cpu->env;
2759     struct kvm_sregs sregs;
2760 
2761     /*
2762      * The interrupt_bitmap is ignored because KVM_SET_SREGS is
2763      * always followed by KVM_SET_VCPU_EVENTS.
2764      */
2765     memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
2766 
2767     if ((env->eflags & VM_MASK)) {
2768         set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
2769         set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
2770         set_v8086_seg(&sregs.es, &env->segs[R_ES]);
2771         set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
2772         set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
2773         set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
2774     } else {
2775         set_seg(&sregs.cs, &env->segs[R_CS]);
2776         set_seg(&sregs.ds, &env->segs[R_DS]);
2777         set_seg(&sregs.es, &env->segs[R_ES]);
2778         set_seg(&sregs.fs, &env->segs[R_FS]);
2779         set_seg(&sregs.gs, &env->segs[R_GS]);
2780         set_seg(&sregs.ss, &env->segs[R_SS]);
2781     }
2782 
2783     set_seg(&sregs.tr, &env->tr);
2784     set_seg(&sregs.ldt, &env->ldt);
2785 
2786     sregs.idt.limit = env->idt.limit;
2787     sregs.idt.base = env->idt.base;
2788     memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
2789     sregs.gdt.limit = env->gdt.limit;
2790     sregs.gdt.base = env->gdt.base;
2791     memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
2792 
2793     sregs.cr0 = env->cr[0];
2794     sregs.cr2 = env->cr[2];
2795     sregs.cr3 = env->cr[3];
2796     sregs.cr4 = env->cr[4];
2797 
2798     sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
2799     sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
2800 
2801     sregs.efer = env->efer;
2802 
2803     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
2804 }
2805 
2806 static int kvm_put_sregs2(X86CPU *cpu)
2807 {
2808     CPUX86State *env = &cpu->env;
2809     struct kvm_sregs2 sregs;
2810     int i;
2811 
2812     sregs.flags = 0;
2813 
2814     if ((env->eflags & VM_MASK)) {
2815         set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
2816         set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
2817         set_v8086_seg(&sregs.es, &env->segs[R_ES]);
2818         set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
2819         set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
2820         set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
2821     } else {
2822         set_seg(&sregs.cs, &env->segs[R_CS]);
2823         set_seg(&sregs.ds, &env->segs[R_DS]);
2824         set_seg(&sregs.es, &env->segs[R_ES]);
2825         set_seg(&sregs.fs, &env->segs[R_FS]);
2826         set_seg(&sregs.gs, &env->segs[R_GS]);
2827         set_seg(&sregs.ss, &env->segs[R_SS]);
2828     }
2829 
2830     set_seg(&sregs.tr, &env->tr);
2831     set_seg(&sregs.ldt, &env->ldt);
2832 
2833     sregs.idt.limit = env->idt.limit;
2834     sregs.idt.base = env->idt.base;
2835     memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
2836     sregs.gdt.limit = env->gdt.limit;
2837     sregs.gdt.base = env->gdt.base;
2838     memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
2839 
2840     sregs.cr0 = env->cr[0];
2841     sregs.cr2 = env->cr[2];
2842     sregs.cr3 = env->cr[3];
2843     sregs.cr4 = env->cr[4];
2844 
2845     sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
2846     sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
2847 
2848     sregs.efer = env->efer;
2849 
2850     if (env->pdptrs_valid) {
2851         for (i = 0; i < 4; i++) {
2852             sregs.pdptrs[i] = env->pdptrs[i];
2853         }
2854         sregs.flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
2855     }
2856 
2857     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS2, &sregs);
2858 }
2859 
2860 
2861 static void kvm_msr_buf_reset(X86CPU *cpu)
2862 {
2863     memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
2864 }
2865 
2866 static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value)
2867 {
2868     struct kvm_msrs *msrs = cpu->kvm_msr_buf;
2869     void *limit = ((void *)msrs) + MSR_BUF_SIZE;
2870     struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs];
2871 
2872     assert((void *)(entry + 1) <= limit);
2873 
2874     entry->index = index;
2875     entry->reserved = 0;
2876     entry->data = value;
2877     msrs->nmsrs++;
2878 }
2879 
2880 static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value)
2881 {
2882     kvm_msr_buf_reset(cpu);
2883     kvm_msr_entry_add(cpu, index, value);
2884 
2885     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
2886 }
2887 
2888 static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value)
2889 {
2890     int ret;
2891     struct {
2892         struct kvm_msrs info;
2893         struct kvm_msr_entry entries[1];
2894     } msr_data = {
2895         .info.nmsrs = 1,
2896         .entries[0].index = index,
2897     };
2898 
2899     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
2900     if (ret < 0) {
2901         return ret;
2902     }
2903     assert(ret == 1);
2904     *value = msr_data.entries[0].data;
2905     return ret;
2906 }
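
/*
 * KVM_SET_MSRS returns the number of MSRs successfully written, so a
 * single-entry update is expected to return exactly 1.
 */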
2907 void kvm_put_apicbase(X86CPU *cpu, uint64_t value)
2908 {
2909     int ret;
2910 
2911     ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value);
2912     assert(ret == 1);
2913 }
2914 
2915 static int kvm_put_tscdeadline_msr(X86CPU *cpu)
2916 {
2917     CPUX86State *env = &cpu->env;
2918     int ret;
2919 
2920     if (!has_msr_tsc_deadline) {
2921         return 0;
2922     }
2923 
2924     ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline);
2925     if (ret < 0) {
2926         return ret;
2927     }
2928 
2929     assert(ret == 1);
2930     return 0;
2931 }
2932 
2933 /*
2934  * Provide a separate write service for the feature control MSR in order to
2935  * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
2936  * before writing any other state because forcibly leaving nested mode
2937  * invalidates the VCPU state.
2938  */
2939 static int kvm_put_msr_feature_control(X86CPU *cpu)
2940 {
2941     int ret;
2942 
2943     if (!has_msr_feature_control) {
2944         return 0;
2945     }
2946 
2947     ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL,
2948                           cpu->env.msr_ia32_feature_control);
2949     if (ret < 0) {
2950         return ret;
2951     }
2952 
2953     assert(ret == 1);
2954     return 0;
2955 }
2956 
2957 static uint64_t make_vmx_msr_value(uint32_t index, uint32_t features)
2958 {
2959     uint32_t default1, can_be_one, can_be_zero;
2960     uint32_t must_be_one;
2961 
2962     switch (index) {
2963     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2964         default1 = 0x00000016;
2965         break;
2966     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2967         default1 = 0x0401e172;
2968         break;
2969     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2970         default1 = 0x000011ff;
2971         break;
2972     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2973         default1 = 0x00036dff;
2974         break;
2975     case MSR_IA32_VMX_PROCBASED_CTLS2:
2976         default1 = 0;
2977         break;
2978     default:
2979         abort();
2980     }
2981 
2982     /* If a feature bit is set, the control can be either set or clear.
2983      * Otherwise the value is limited to either 0 or 1 by default1.
2984      */
2985     can_be_one = features | default1;
2986     can_be_zero = features | ~default1;
2987     must_be_one = ~can_be_zero;
2988 
2989     /*
2990      * Bits 0:31 -> 0 if the control bit can be zero (i.e. 1 if it must be one).
2991      * Bits 32:63 -> 1 if the control bit can be one.
2992      */
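    /*
     * Worked example (illustrative): with default1 = 0x16 and features =
     * 0x04, bits 1 and 4 must be one, bit 2 may take either value and all
     * other bits must be zero, giving a return value of 0x0000001600000012.
     */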
2993     return must_be_one | (((uint64_t)can_be_one) << 32);
2994 }
2995 
2996 static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f)
2997 {
2998     uint64_t kvm_vmx_basic =
2999         kvm_arch_get_supported_msr_feature(kvm_state,
3000                                            MSR_IA32_VMX_BASIC);
3001 
3002     if (!kvm_vmx_basic) {
3003         /* If the kernel doesn't support the VMX feature (kvm_intel.nested=0),
3004          * then kvm_vmx_basic will be 0 and KVM_SET_MSR will fail.
3005          */
3006         return;
3007     }
3008 
3009     uint64_t kvm_vmx_misc =
3010         kvm_arch_get_supported_msr_feature(kvm_state,
3011                                            MSR_IA32_VMX_MISC);
3012     uint64_t kvm_vmx_ept_vpid =
3013         kvm_arch_get_supported_msr_feature(kvm_state,
3014                                            MSR_IA32_VMX_EPT_VPID_CAP);
3015 
3016     /*
3017      * If the guest is 64-bit, a value of 1 is allowed for the host address
3018      * space size vmexit control.
3019      */
3020     uint64_t fixed_vmx_exit = f[FEAT_8000_0001_EDX] & CPUID_EXT2_LM
3021         ? (uint64_t)VMX_VM_EXIT_HOST_ADDR_SPACE_SIZE << 32 : 0;
3022 
3023     /*
3024      * Bits 0-30, 32-44 and 50-53 come from the host.  KVM should
3025      * not change them for backwards compatibility.
3026      */
3027     uint64_t fixed_vmx_basic = kvm_vmx_basic &
3028         (MSR_VMX_BASIC_VMCS_REVISION_MASK |
3029          MSR_VMX_BASIC_VMXON_REGION_SIZE_MASK |
3030          MSR_VMX_BASIC_VMCS_MEM_TYPE_MASK);
3031 
3032     /*
3033      * Same for bits 0-4 and 25-27.  Bits 16-24 (CR3 target count) can
3034      * change in the future but are always zero for now; clear them to be
3035      * future-proof.  Bits 32-63 in theory could change, though KVM does
3036      * not support dual-monitor treatment and probably never will; mask
3037      * them out as well.
3038      */
3039     uint64_t fixed_vmx_misc = kvm_vmx_misc &
3040         (MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK |
3041          MSR_VMX_MISC_MAX_MSR_LIST_SIZE_MASK);
3042 
3043     /*
3044      * EPT memory types should not change either, so we do not bother
3045      * adding features for them.
3046      */
3047     uint64_t fixed_vmx_ept_mask =
3048             (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_ENABLE_EPT ?
3049              MSR_VMX_EPT_UC | MSR_VMX_EPT_WB : 0);
3050     uint64_t fixed_vmx_ept_vpid = kvm_vmx_ept_vpid & fixed_vmx_ept_mask;
3051 
3052     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
3053                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
3054                                          f[FEAT_VMX_PROCBASED_CTLS]));
3055     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS,
3056                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_PINBASED_CTLS,
3057                                          f[FEAT_VMX_PINBASED_CTLS]));
3058     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_EXIT_CTLS,
3059                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_EXIT_CTLS,
3060                                          f[FEAT_VMX_EXIT_CTLS]) | fixed_vmx_exit);
3061     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS,
3062                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_ENTRY_CTLS,
3063                                          f[FEAT_VMX_ENTRY_CTLS]));
3064     kvm_msr_entry_add(cpu, MSR_IA32_VMX_PROCBASED_CTLS2,
3065                       make_vmx_msr_value(MSR_IA32_VMX_PROCBASED_CTLS2,
3066                                          f[FEAT_VMX_SECONDARY_CTLS]));
3067     kvm_msr_entry_add(cpu, MSR_IA32_VMX_EPT_VPID_CAP,
3068                       f[FEAT_VMX_EPT_VPID_CAPS] | fixed_vmx_ept_vpid);
3069     kvm_msr_entry_add(cpu, MSR_IA32_VMX_BASIC,
3070                       f[FEAT_VMX_BASIC] | fixed_vmx_basic);
3071     kvm_msr_entry_add(cpu, MSR_IA32_VMX_MISC,
3072                       f[FEAT_VMX_MISC] | fixed_vmx_misc);
3073     if (has_msr_vmx_vmfunc) {
3074         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMFUNC, f[FEAT_VMX_VMFUNC]);
3075     }
3076 
3077     /*
3078      * Just to be safe, write these with constant values.  The CRn_FIXED1
3079      * MSRs are generated by KVM based on the vCPU's CPUID.
3080      */
3081     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR0_FIXED0,
3082                       CR0_PE_MASK | CR0_PG_MASK | CR0_NE_MASK);
3083     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0,
3084                       CR4_VMXE_MASK);
3085 
3086     if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
3087         /* TSC multiplier (0x2032).  */
3088         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32);
3089     } else {
3090         /* Preemption timer (0x482E).  */
3091         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x2E);
3092     }
3093 }
3094 
3095 static void kvm_msr_entry_add_perf(X86CPU *cpu, FeatureWordArray f)
3096 {
3097     uint64_t kvm_perf_cap =
3098         kvm_arch_get_supported_msr_feature(kvm_state,
3099                                            MSR_IA32_PERF_CAPABILITIES);
3100 
3101     if (kvm_perf_cap) {
3102         kvm_msr_entry_add(cpu, MSR_IA32_PERF_CAPABILITIES,
3103                         kvm_perf_cap & f[FEAT_PERF_CAPABILITIES]);
3104     }
3105 }
3106 
3107 static int kvm_buf_set_msrs(X86CPU *cpu)
3108 {
3109     int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
3110     if (ret < 0) {
3111         return ret;
3112     }
3113 
3114     if (ret < cpu->kvm_msr_buf->nmsrs) {
3115         struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
3116         error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64,
3117                      (uint32_t)e->index, (uint64_t)e->data);
3118     }
3119 
3120     assert(ret == cpu->kvm_msr_buf->nmsrs);
3121     return 0;
3122 }
3123 
3124 static void kvm_init_msrs(X86CPU *cpu)
3125 {
3126     CPUX86State *env = &cpu->env;
3127 
3128     kvm_msr_buf_reset(cpu);
3129     if (has_msr_arch_capabs) {
3130         kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
3131                           env->features[FEAT_ARCH_CAPABILITIES]);
3132     }
3133 
3134     if (has_msr_core_capabs) {
3135         kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
3136                           env->features[FEAT_CORE_CAPABILITY]);
3137     }
3138 
3139     if (has_msr_perf_capabs && cpu->enable_pmu) {
3140         kvm_msr_entry_add_perf(cpu, env->features);
3141     }
3142 
3143     if (has_msr_ucode_rev) {
3144         kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev);
3145     }
3146 
3147     /*
3148      * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but
3149      * all kernels with MSR features should have them.
3150      */
3151     if (kvm_feature_msrs && cpu_has_vmx(env)) {
3152         kvm_msr_entry_add_vmx(cpu, env->features);
3153     }
3154 
3155     assert(kvm_buf_set_msrs(cpu) == 0);
3156 }
3157 
3158 static int kvm_put_msrs(X86CPU *cpu, int level)
3159 {
3160     CPUX86State *env = &cpu->env;
3161     int i;
3162 
3163     kvm_msr_buf_reset(cpu);
3164 
3165     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs);
3166     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
3167     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
3168     kvm_msr_entry_add(cpu, MSR_PAT, env->pat);
3169     if (has_msr_star) {
3170         kvm_msr_entry_add(cpu, MSR_STAR, env->star);
3171     }
3172     if (has_msr_hsave_pa) {
3173         kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave);
3174     }
3175     if (has_msr_tsc_aux) {
3176         kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux);
3177     }
3178     if (has_msr_tsc_adjust) {
3179         kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust);
3180     }
3181     if (has_msr_misc_enable) {
3182         kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE,
3183                           env->msr_ia32_misc_enable);
3184     }
3185     if (has_msr_smbase) {
3186         kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase);
3187     }
3188     if (has_msr_smi_count) {
3189         kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count);
3190     }
3191     if (has_msr_pkrs) {
3192         kvm_msr_entry_add(cpu, MSR_IA32_PKRS, env->pkrs);
3193     }
3194     if (has_msr_bndcfgs) {
3195         kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs);
3196     }
3197     if (has_msr_xss) {
3198         kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss);
3199     }
3200     if (has_msr_umwait) {
3201         kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, env->umwait);
3202     }
3203     if (has_msr_spec_ctrl) {
3204         kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl);
3205     }
3206     if (has_tsc_scale_msr) {
3207         kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, env->amd_tsc_scale_msr);
3208     }
3209 
3210     if (has_msr_tsx_ctrl) {
3211         kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, env->tsx_ctrl);
3212     }
3213     if (has_msr_virt_ssbd) {
3214         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd);
3215     }
3216 
3217 #ifdef TARGET_X86_64
3218     if (lm_capable_kernel) {
3219         kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar);
3220         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
3221         kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
3222         kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
3223     }
3224 #endif
3225 
3226     /*
3227      * The following MSRs have side effects on the guest or are too heavy
3228      * for normal writeback. Limit them to reset or full state updates.
3229      */
3230     if (level >= KVM_PUT_RESET_STATE) {
3231         kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
3232         kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
3233         kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
3234         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
3235             kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr);
3236         }
3237         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
3238             kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
3239         }
3240         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
3241             kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr);
3242         }
3243         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
3244             kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr);
3245         }
3246 
3247         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
3248             kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr);
3249         }
3250 
3251         if (has_architectural_pmu_version > 0) {
3252             if (has_architectural_pmu_version > 1) {
3253                 /* Stop the counter.  */
3254                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
3255                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
3256             }
3257 
3258             /* Set the counter values.  */
3259             for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
3260                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i,
3261                                   env->msr_fixed_counters[i]);
3262             }
3263             for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
3264                 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i,
3265                                   env->msr_gp_counters[i]);
3266                 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i,
3267                                   env->msr_gp_evtsel[i]);
3268             }
3269             if (has_architectural_pmu_version > 1) {
3270                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS,
3271                                   env->msr_global_status);
3272                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
3273                                   env->msr_global_ovf_ctrl);
3274 
3275                 /* Now start the PMU.  */
3276                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL,
3277                                   env->msr_fixed_ctr_ctrl);
3278                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL,
3279                                   env->msr_global_ctrl);
3280             }
3281         }
3282         /*
3283          * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add,
3284          * only sync them to KVM on the first cpu
3285          */
3286         if (current_cpu == first_cpu) {
3287             if (has_msr_hv_hypercall) {
3288                 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID,
3289                                   env->msr_hv_guest_os_id);
3290                 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL,
3291                                   env->msr_hv_hypercall);
3292             }
3293             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
3294                 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC,
3295                                   env->msr_hv_tsc);
3296             }
3297             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
3298                 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL,
3299                                   env->msr_hv_reenlightenment_control);
3300                 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL,
3301                                   env->msr_hv_tsc_emulation_control);
3302                 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS,
3303                                   env->msr_hv_tsc_emulation_status);
3304             }
3305 #ifdef CONFIG_SYNDBG
3306             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG) &&
3307                 has_msr_hv_syndbg_options) {
3308                 kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS,
3309                                   hyperv_syndbg_query_options());
3310             }
3311 #endif
3312         }
3313         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
3314             kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE,
3315                               env->msr_hv_vapic);
3316         }
3317         if (has_msr_hv_crash) {
3318             int j;
3319 
3320             for (j = 0; j < HV_CRASH_PARAMS; j++)
3321                 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j,
3322                                   env->msr_hv_crash_params[j]);
3323 
3324             kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY);
3325         }
3326         if (has_msr_hv_runtime) {
3327             kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime);
3328         }
3329         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)
3330             && hv_vpindex_settable) {
3331             kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX,
3332                               hyperv_vp_index(CPU(cpu)));
3333         }
3334         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
3335             int j;
3336 
3337             kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION);
3338 
3339             kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL,
3340                               env->msr_hv_synic_control);
3341             kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP,
3342                               env->msr_hv_synic_evt_page);
3343             kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP,
3344                               env->msr_hv_synic_msg_page);
3345 
3346             for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) {
3347                 kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j,
3348                                   env->msr_hv_synic_sint[j]);
3349             }
3350         }
3351         if (has_msr_hv_stimer) {
3352             int j;
3353 
3354             for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) {
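            /*
             * The Hyper-V synthetic timer MSRs interleave CONFIG and COUNT
             * (STIMER0_CONFIG, STIMER0_COUNT, STIMER1_CONFIG, ...), hence
             * the stride of 2 in the loops below.
             */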
3355                 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2,
3356                                 env->msr_hv_stimer_config[j]);
3357             }
3358 
3359             for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) {
3360                 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2,
3361                                 env->msr_hv_stimer_count[j]);
3362             }
3363         }
3364         if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
3365             uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits);
3366 
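            /*
             * The eleven fixed-range MTRRs written below together cover the
             * first 1 MiB of physical memory: one 64 KiB-granular register
             * for 0x00000-0x7FFFF, two 16 KiB-granular registers up to
             * 0xBFFFF, and eight 4 KiB-granular registers up to 0xFFFFF.
             */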
3367             kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype);
3368             kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]);
3369             kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]);
3370             kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]);
3371             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]);
3372             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]);
3373             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]);
3374             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]);
3375             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]);
3376             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]);
3377             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]);
3378             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]);
3379             for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
3380                 /* The CPU GPs if we write to a bit above the physical limit of
3381                  * the host CPU (and KVM emulates that)
3382                  */
3383                 uint64_t mask = env->mtrr_var[i].mask;
3384                 mask &= phys_mask;
3385 
3386                 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i),
3387                                   env->mtrr_var[i].base);
3388                 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask);
3389             }
3390         }
3391         if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
3392             int addr_num = kvm_arch_get_supported_cpuid(kvm_state,
3393                                                     0x14, 1, R_EAX) & 0x7;
3394 
3395             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL,
3396                             env->msr_rtit_ctrl);
3397             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS,
3398                             env->msr_rtit_status);
3399             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE,
3400                             env->msr_rtit_output_base);
3401             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK,
3402                             env->msr_rtit_output_mask);
3403             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH,
3404                             env->msr_rtit_cr3_match);
3405             for (i = 0; i < addr_num; i++) {
3406                 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i,
3407                             env->msr_rtit_addrs[i]);
3408             }
3409         }
3410 
3411         if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
3412             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0,
3413                               env->msr_ia32_sgxlepubkeyhash[0]);
3414             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1,
3415                               env->msr_ia32_sgxlepubkeyhash[1]);
3416             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2,
3417                               env->msr_ia32_sgxlepubkeyhash[2]);
3418             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3,
3419                               env->msr_ia32_sgxlepubkeyhash[3]);
3420         }
3421 
3422         if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
3423             kvm_msr_entry_add(cpu, MSR_IA32_XFD,
3424                               env->msr_xfd);
3425             kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR,
3426                               env->msr_xfd_err);
3427         }
3428 
3429         if (kvm_enabled() && cpu->enable_pmu &&
3430             (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) {
3431             uint64_t depth;
3432             int i, ret;
3433 
3434             /*
3435              * Only migrate Arch LBR state when the host Arch LBR depth
3436              * equals the source guest's; this avoids a guest/host mismatch
3437              * in the MSR configuration and the unexpected behavior that
3438              * would follow.
3439              */
3440             ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth);
3441 
3442             if (ret == 1 && !!depth && depth == env->msr_lbr_depth) {
3443                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, env->msr_lbr_ctl);
3444                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, env->msr_lbr_depth);
3445 
3446                 for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) {
3447                     if (!env->lbr_records[i].from) {
3448                         continue;
3449                     }
3450                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i,
3451                                       env->lbr_records[i].from);
3452                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i,
3453                                       env->lbr_records[i].to);
3454                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i,
3455                                       env->lbr_records[i].info);
3456                 }
3457             }
3458         }
3459 
3460         /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
3461          *       kvm_put_msr_feature_control. */
3462     }
3463 
3464     if (env->mcg_cap) {
3465         int i;
3466 
3467         kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status);
3468         kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl);
3469         if (has_msr_mcg_ext_ctl) {
3470             kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl);
3471         }
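        /*
         * The low byte of MCG_CAP holds the machine-check bank count; each
         * bank exposes four MSRs (CTL, STATUS, ADDR, MISC), hence the
         * (mcg_cap & 0xff) * 4 loop bound below.
         */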
3472         for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3473             kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]);
3474         }
3475     }
3476 
3477     return kvm_buf_set_msrs(cpu);
3478 }
3479 
3480 
3481 static int kvm_get_fpu(X86CPU *cpu)
3482 {
3483     CPUX86State *env = &cpu->env;
3484     struct kvm_fpu fpu;
3485     int i, ret;
3486 
3487     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_FPU, &fpu);
3488     if (ret < 0) {
3489         return ret;
3490     }
3491 
3492     env->fpstt = (fpu.fsw >> 11) & 7;
3493     env->fpus = fpu.fsw;
3494     env->fpuc = fpu.fcw;
3495     env->fpop = fpu.last_opcode;
3496     env->fpip = fpu.last_ip;
3497     env->fpdp = fpu.last_dp;
3498     for (i = 0; i < 8; ++i) {
3499         env->fptags[i] = !((fpu.ftwx >> i) & 1);
3500     }
3501     memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
3502     for (i = 0; i < CPU_NB_REGS; i++) {
3503         env->xmm_regs[i].ZMM_Q(0) = ldq_p(&fpu.xmm[i][0]);
3504         env->xmm_regs[i].ZMM_Q(1) = ldq_p(&fpu.xmm[i][8]);
3505     }
3506     env->mxcsr = fpu.mxcsr;
3507 
3508     return 0;
3509 }
3510 
3511 static int kvm_get_xsave(X86CPU *cpu)
3512 {
3513     CPUX86State *env = &cpu->env;
3514     void *xsave = env->xsave_buf;
3515     int type, ret;
3516 
3517     if (!has_xsave) {
3518         return kvm_get_fpu(cpu);
3519     }
3520 
3521     type = has_xsave2 ? KVM_GET_XSAVE2 : KVM_GET_XSAVE;
3522     ret = kvm_vcpu_ioctl(CPU(cpu), type, xsave);
3523     if (ret < 0) {
3524         return ret;
3525     }
3526     x86_cpu_xrstor_all_areas(cpu, xsave, env->xsave_buf_len);
3527 
3528     return 0;
3529 }
3530 
3531 static int kvm_get_xcrs(X86CPU *cpu)
3532 {
3533     CPUX86State *env = &cpu->env;
3534     int i, ret;
3535     struct kvm_xcrs xcrs;
3536 
3537     if (!has_xcrs) {
3538         return 0;
3539     }
3540 
3541     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
3542     if (ret < 0) {
3543         return ret;
3544     }
3545 
3546     for (i = 0; i < xcrs.nr_xcrs; i++) {
3547         /* Only support xcr0 now */
3548         if (xcrs.xcrs[i].xcr == 0) {
3549             env->xcr0 = xcrs.xcrs[i].value;
3550             break;
3551         }
3552     }
3553     return 0;
3554 }
3555 
3556 static int kvm_get_sregs(X86CPU *cpu)
3557 {
3558     CPUX86State *env = &cpu->env;
3559     struct kvm_sregs sregs;
3560     int ret;
3561 
3562     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
3563     if (ret < 0) {
3564         return ret;
3565     }
3566 
3567     /*
3568      * The interrupt_bitmap is ignored because KVM_GET_SREGS is
3569      * always preceded by KVM_GET_VCPU_EVENTS.
3570      */
3571 
3572     get_seg(&env->segs[R_CS], &sregs.cs);
3573     get_seg(&env->segs[R_DS], &sregs.ds);
3574     get_seg(&env->segs[R_ES], &sregs.es);
3575     get_seg(&env->segs[R_FS], &sregs.fs);
3576     get_seg(&env->segs[R_GS], &sregs.gs);
3577     get_seg(&env->segs[R_SS], &sregs.ss);
3578 
3579     get_seg(&env->tr, &sregs.tr);
3580     get_seg(&env->ldt, &sregs.ldt);
3581 
3582     env->idt.limit = sregs.idt.limit;
3583     env->idt.base = sregs.idt.base;
3584     env->gdt.limit = sregs.gdt.limit;
3585     env->gdt.base = sregs.gdt.base;
3586 
3587     env->cr[0] = sregs.cr0;
3588     env->cr[2] = sregs.cr2;
3589     env->cr[3] = sregs.cr3;
3590     env->cr[4] = sregs.cr4;
3591 
3592     env->efer = sregs.efer;
3593 
3594     /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
3595     x86_update_hflags(env);
3596 
3597     return 0;
3598 }
3599 
3600 static int kvm_get_sregs2(X86CPU *cpu)
3601 {
3602     CPUX86State *env = &cpu->env;
3603     struct kvm_sregs2 sregs;
3604     int i, ret;
3605 
3606     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs);
3607     if (ret < 0) {
3608         return ret;
3609     }
3610 
3611     get_seg(&env->segs[R_CS], &sregs.cs);
3612     get_seg(&env->segs[R_DS], &sregs.ds);
3613     get_seg(&env->segs[R_ES], &sregs.es);
3614     get_seg(&env->segs[R_FS], &sregs.fs);
3615     get_seg(&env->segs[R_GS], &sregs.gs);
3616     get_seg(&env->segs[R_SS], &sregs.ss);
3617 
3618     get_seg(&env->tr, &sregs.tr);
3619     get_seg(&env->ldt, &sregs.ldt);
3620 
3621     env->idt.limit = sregs.idt.limit;
3622     env->idt.base = sregs.idt.base;
3623     env->gdt.limit = sregs.gdt.limit;
3624     env->gdt.base = sregs.gdt.base;
3625 
3626     env->cr[0] = sregs.cr0;
3627     env->cr[2] = sregs.cr2;
3628     env->cr[3] = sregs.cr3;
3629     env->cr[4] = sregs.cr4;
3630 
3631     env->efer = sregs.efer;
3632 
3633     env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
3634 
3635     if (env->pdptrs_valid) {
3636         for (i = 0; i < 4; i++) {
3637             env->pdptrs[i] = sregs.pdptrs[i];
3638         }
3639     }
3640 
3641     /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
3642     x86_update_hflags(env);
3643 
3644     return 0;
3645 }
3646 
3647 static int kvm_get_msrs(X86CPU *cpu)
3648 {
3649     CPUX86State *env = &cpu->env;
3650     struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
3651     int ret, i;
3652     uint64_t mtrr_top_bits;
3653 
3654     kvm_msr_buf_reset(cpu);
3655 
3656     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0);
3657     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0);
3658     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0);
3659     kvm_msr_entry_add(cpu, MSR_PAT, 0);
3660     if (has_msr_star) {
3661         kvm_msr_entry_add(cpu, MSR_STAR, 0);
3662     }
3663     if (has_msr_hsave_pa) {
3664         kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0);
3665     }
3666     if (has_msr_tsc_aux) {
3667         kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0);
3668     }
3669     if (has_msr_tsc_adjust) {
3670         kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0);
3671     }
3672     if (has_msr_tsc_deadline) {
3673         kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0);
3674     }
3675     if (has_msr_misc_enable) {
3676         kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0);
3677     }
3678     if (has_msr_smbase) {
3679         kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0);
3680     }
3681     if (has_msr_smi_count) {
3682         kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0);
3683     }
3684     if (has_msr_feature_control) {
3685         kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0);
3686     }
3687     if (has_msr_pkrs) {
3688         kvm_msr_entry_add(cpu, MSR_IA32_PKRS, 0);
3689     }
3690     if (has_msr_bndcfgs) {
3691         kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0);
3692     }
3693     if (has_msr_xss) {
3694         kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0);
3695     }
3696     if (has_msr_umwait) {
3697         kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, 0);
3698     }
3699     if (has_msr_spec_ctrl) {
3700         kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0);
3701     }
3702     if (has_tsc_scale_msr) {
3703         kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, 0);
3704     }
3705 
3706     if (has_msr_tsx_ctrl) {
3707         kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, 0);
3708     }
3709     if (has_msr_virt_ssbd) {
3710         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0);
3711     }
3712     if (!env->tsc_valid) {
3713         kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0);
3714         env->tsc_valid = !runstate_is_running();
3715     }
3716 
3717 #ifdef TARGET_X86_64
3718     if (lm_capable_kernel) {
3719         kvm_msr_entry_add(cpu, MSR_CSTAR, 0);
3720         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0);
3721         kvm_msr_entry_add(cpu, MSR_FMASK, 0);
3722         kvm_msr_entry_add(cpu, MSR_LSTAR, 0);
3723     }
3724 #endif
3725     kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
3726     kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0);
3727     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
3728         kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0);
3729     }
3730     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
3731         kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0);
3732     }
3733     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
3734         kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0);
3735     }
3736     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
3737         kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
3738     }
3739     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
3740         kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
3741     }
3742     if (has_architectural_pmu_version > 0) {
3743         if (has_architectural_pmu_version > 1) {
3744             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
3745             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
3746             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0);
3747             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0);
3748         }
3749         for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
3750             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0);
3751         }
3752         for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
3753             kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0);
3754             kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0);
3755         }
3756     }
3757 
3758     if (env->mcg_cap) {
3759         kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
3760         kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
3761         if (has_msr_mcg_ext_ctl) {
3762             kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0);
3763         }
3764         for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3765             kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0);
3766         }
3767     }
3768 
3769     if (has_msr_hv_hypercall) {
3770         kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0);
3771         kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0);
3772     }
3773     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
3774         kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0);
3775     }
3776     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
3777         kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0);
3778     }
3779     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
3780         kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0);
3781         kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0);
3782         kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0);
3783     }
3784     if (has_msr_hv_syndbg_options) {
3785         kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS, 0);
3786     }
3787     if (has_msr_hv_crash) {
3788         int j;
3789 
3790         for (j = 0; j < HV_CRASH_PARAMS; j++) {
3791             kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0);
3792         }
3793     }
3794     if (has_msr_hv_runtime) {
3795         kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0);
3796     }
3797     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
3798         uint32_t msr;
3799 
3800         kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0);
3801         kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0);
3802         kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0);
3803         for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) {
3804             kvm_msr_entry_add(cpu, msr, 0);
3805         }
3806     }
3807     if (has_msr_hv_stimer) {
3808         uint32_t msr;
3809 
3810         for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT;
3811              msr++) {
3812             kvm_msr_entry_add(cpu, msr, 0);
3813         }
3814     }
3815     if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
3816         kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0);
3817         kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0);
3818         kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0);
3819         kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0);
3820         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0);
3821         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0);
3822         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, 0);
3823         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0);
3824         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0);
3825         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0);
3826         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0);
3827         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0);
3828         for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
3829             kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0);
3830             kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0);
3831         }
3832     }
3833 
3834     if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
3835         int addr_num =
3836             kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7;
3837 
3838         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0);
3839         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0);
3840         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0);
3841         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0);
3842         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0);
3843         for (i = 0; i < addr_num; i++) {
3844             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0);
3845         }
3846     }
3847 
3848     if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
3849         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, 0);
3850         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, 0);
3851         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, 0);
3852         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0);
3853     }
3854 
3855     if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
3856         kvm_msr_entry_add(cpu, MSR_IA32_XFD, 0);
3857         kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR, 0);
3858     }
3859 
3860     if (kvm_enabled() && cpu->enable_pmu &&
3861         (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) {
3862         uint64_t depth;
3863         int i, ret;
3864 
3865         ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth);
3866         if (ret == 1 && depth == ARCH_LBR_NR_ENTRIES) {
3867             kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, 0);
3868             kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, 0);
3869 
3870             for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) {
3871                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i, 0);
3872                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i, 0);
3873                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i, 0);
3874             }
3875         }
3876     }
3877 
3878     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
3879     if (ret < 0) {
3880         return ret;
3881     }
3882 
3883     if (ret < cpu->kvm_msr_buf->nmsrs) {
3884         struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
3885         error_report("error: failed to get MSR 0x%" PRIx32,
3886                      (uint32_t)e->index);
3887     }
3888 
3889     assert(ret == cpu->kvm_msr_buf->nmsrs);
3890     /*
3891      * MTRR masks: Each mask consists of 5 parts
3892      * a  10..0: must be zero
3893      * b  11   : valid bit
3894      * c n-1..12: actual mask bits
3895      * d  51..n : reserved, must be zero
3896      * e  63..52: reserved, must be zero
3897      *
3898      * 'n' is the number of physical bits supported by the CPU and is
3899      * apparently always <= 52.  We know our 'n' but don't know what
3900      * the destination's 'n' is; it might be smaller, in which case
3901      * it masks (c) on loading. It might be larger, in which case
3902      * we fill 'd' so that d..c is consistent irrespective of the 'n'
3903      * we're migrating to.
3904      */
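    /*
     * Worked example (assuming cpu->phys_bits == 46): MAKE_64BIT_MASK(46,
     * 52 - 46) sets bits 51..46, i.e. mtrr_top_bits == 0x000fc00000000000;
     * that value is ORed into each MTRRphysMask value read back below.
     */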
3905 
3906     if (cpu->fill_mtrr_mask) {
3907         QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52);
3908         assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS);
3909         mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits);
3910     } else {
3911         mtrr_top_bits = 0;
3912     }
3913 
3914     for (i = 0; i < ret; i++) {
3915         uint32_t index = msrs[i].index;
3916         switch (index) {
3917         case MSR_IA32_SYSENTER_CS:
3918             env->sysenter_cs = msrs[i].data;
3919             break;
3920         case MSR_IA32_SYSENTER_ESP:
3921             env->sysenter_esp = msrs[i].data;
3922             break;
3923         case MSR_IA32_SYSENTER_EIP:
3924             env->sysenter_eip = msrs[i].data;
3925             break;
3926         case MSR_PAT:
3927             env->pat = msrs[i].data;
3928             break;
3929         case MSR_STAR:
3930             env->star = msrs[i].data;
3931             break;
3932 #ifdef TARGET_X86_64
3933         case MSR_CSTAR:
3934             env->cstar = msrs[i].data;
3935             break;
3936         case MSR_KERNELGSBASE:
3937             env->kernelgsbase = msrs[i].data;
3938             break;
3939         case MSR_FMASK:
3940             env->fmask = msrs[i].data;
3941             break;
3942         case MSR_LSTAR:
3943             env->lstar = msrs[i].data;
3944             break;
3945 #endif
3946         case MSR_IA32_TSC:
3947             env->tsc = msrs[i].data;
3948             break;
3949         case MSR_TSC_AUX:
3950             env->tsc_aux = msrs[i].data;
3951             break;
3952         case MSR_TSC_ADJUST:
3953             env->tsc_adjust = msrs[i].data;
3954             break;
3955         case MSR_IA32_TSCDEADLINE:
3956             env->tsc_deadline = msrs[i].data;
3957             break;
3958         case MSR_VM_HSAVE_PA:
3959             env->vm_hsave = msrs[i].data;
3960             break;
3961         case MSR_KVM_SYSTEM_TIME:
3962             env->system_time_msr = msrs[i].data;
3963             break;
3964         case MSR_KVM_WALL_CLOCK:
3965             env->wall_clock_msr = msrs[i].data;
3966             break;
3967         case MSR_MCG_STATUS:
3968             env->mcg_status = msrs[i].data;
3969             break;
3970         case MSR_MCG_CTL:
3971             env->mcg_ctl = msrs[i].data;
3972             break;
3973         case MSR_MCG_EXT_CTL:
3974             env->mcg_ext_ctl = msrs[i].data;
3975             break;
3976         case MSR_IA32_MISC_ENABLE:
3977             env->msr_ia32_misc_enable = msrs[i].data;
3978             break;
3979         case MSR_IA32_SMBASE:
3980             env->smbase = msrs[i].data;
3981             break;
3982         case MSR_SMI_COUNT:
3983             env->msr_smi_count = msrs[i].data;
3984             break;
3985         case MSR_IA32_FEATURE_CONTROL:
3986             env->msr_ia32_feature_control = msrs[i].data;
3987             break;
3988         case MSR_IA32_BNDCFGS:
3989             env->msr_bndcfgs = msrs[i].data;
3990             break;
3991         case MSR_IA32_XSS:
3992             env->xss = msrs[i].data;
3993             break;
3994         case MSR_IA32_UMWAIT_CONTROL:
3995             env->umwait = msrs[i].data;
3996             break;
3997         case MSR_IA32_PKRS:
3998             env->pkrs = msrs[i].data;
3999             break;
4000         default:
4001             if (msrs[i].index >= MSR_MC0_CTL &&
4002                 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
4003                 env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
4004             }
4005             break;
4006         case MSR_KVM_ASYNC_PF_EN:
4007             env->async_pf_en_msr = msrs[i].data;
4008             break;
4009         case MSR_KVM_ASYNC_PF_INT:
4010             env->async_pf_int_msr = msrs[i].data;
4011             break;
4012         case MSR_KVM_PV_EOI_EN:
4013             env->pv_eoi_en_msr = msrs[i].data;
4014             break;
4015         case MSR_KVM_STEAL_TIME:
4016             env->steal_time_msr = msrs[i].data;
4017             break;
4018         case MSR_KVM_POLL_CONTROL: {
4019             env->poll_control_msr = msrs[i].data;
4020             break;
4021         }
4022         case MSR_CORE_PERF_FIXED_CTR_CTRL:
4023             env->msr_fixed_ctr_ctrl = msrs[i].data;
4024             break;
4025         case MSR_CORE_PERF_GLOBAL_CTRL:
4026             env->msr_global_ctrl = msrs[i].data;
4027             break;
4028         case MSR_CORE_PERF_GLOBAL_STATUS:
4029             env->msr_global_status = msrs[i].data;
4030             break;
4031         case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
4032             env->msr_global_ovf_ctrl = msrs[i].data;
4033             break;
4034         case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1:
4035             env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data;
4036             break;
4037         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1:
4038             env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data;
4039             break;
4040         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
4041             env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
4042             break;
4043         case HV_X64_MSR_HYPERCALL:
4044             env->msr_hv_hypercall = msrs[i].data;
4045             break;
4046         case HV_X64_MSR_GUEST_OS_ID:
4047             env->msr_hv_guest_os_id = msrs[i].data;
4048             break;
4049         case HV_X64_MSR_APIC_ASSIST_PAGE:
4050             env->msr_hv_vapic = msrs[i].data;
4051             break;
4052         case HV_X64_MSR_REFERENCE_TSC:
4053             env->msr_hv_tsc = msrs[i].data;
4054             break;
4055         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
4056             env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data;
4057             break;
4058         case HV_X64_MSR_VP_RUNTIME:
4059             env->msr_hv_runtime = msrs[i].data;
4060             break;
4061         case HV_X64_MSR_SCONTROL:
4062             env->msr_hv_synic_control = msrs[i].data;
4063             break;
4064         case HV_X64_MSR_SIEFP:
4065             env->msr_hv_synic_evt_page = msrs[i].data;
4066             break;
4067         case HV_X64_MSR_SIMP:
4068             env->msr_hv_synic_msg_page = msrs[i].data;
4069             break;
4070         case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
4071             env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data;
4072             break;
4073         case HV_X64_MSR_STIMER0_CONFIG:
4074         case HV_X64_MSR_STIMER1_CONFIG:
4075         case HV_X64_MSR_STIMER2_CONFIG:
4076         case HV_X64_MSR_STIMER3_CONFIG:
4077             env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] =
4078                                 msrs[i].data;
4079             break;
4080         case HV_X64_MSR_STIMER0_COUNT:
4081         case HV_X64_MSR_STIMER1_COUNT:
4082         case HV_X64_MSR_STIMER2_COUNT:
4083         case HV_X64_MSR_STIMER3_COUNT:
4084             env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] =
4085                                 msrs[i].data;
4086             break;
4087         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
4088             env->msr_hv_reenlightenment_control = msrs[i].data;
4089             break;
4090         case HV_X64_MSR_TSC_EMULATION_CONTROL:
4091             env->msr_hv_tsc_emulation_control = msrs[i].data;
4092             break;
4093         case HV_X64_MSR_TSC_EMULATION_STATUS:
4094             env->msr_hv_tsc_emulation_status = msrs[i].data;
4095             break;
4096         case HV_X64_MSR_SYNDBG_OPTIONS:
4097             env->msr_hv_syndbg_options = msrs[i].data;
4098             break;
4099         case MSR_MTRRdefType:
4100             env->mtrr_deftype = msrs[i].data;
4101             break;
4102         case MSR_MTRRfix64K_00000:
4103             env->mtrr_fixed[0] = msrs[i].data;
4104             break;
4105         case MSR_MTRRfix16K_80000:
4106             env->mtrr_fixed[1] = msrs[i].data;
4107             break;
4108         case MSR_MTRRfix16K_A0000:
4109             env->mtrr_fixed[2] = msrs[i].data;
4110             break;
4111         case MSR_MTRRfix4K_C0000:
4112             env->mtrr_fixed[3] = msrs[i].data;
4113             break;
4114         case MSR_MTRRfix4K_C8000:
4115             env->mtrr_fixed[4] = msrs[i].data;
4116             break;
4117         case MSR_MTRRfix4K_D0000:
4118             env->mtrr_fixed[5] = msrs[i].data;
4119             break;
4120         case MSR_MTRRfix4K_D8000:
4121             env->mtrr_fixed[6] = msrs[i].data;
4122             break;
4123         case MSR_MTRRfix4K_E0000:
4124             env->mtrr_fixed[7] = msrs[i].data;
4125             break;
4126         case MSR_MTRRfix4K_E8000:
4127             env->mtrr_fixed[8] = msrs[i].data;
4128             break;
4129         case MSR_MTRRfix4K_F0000:
4130             env->mtrr_fixed[9] = msrs[i].data;
4131             break;
4132         case MSR_MTRRfix4K_F8000:
4133             env->mtrr_fixed[10] = msrs[i].data;
4134             break;
4135         case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
4136             if (index & 1) {
4137                 env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
4138                                                                mtrr_top_bits;
4139             } else {
4140                 env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
4141             }
4142             break;
4143         case MSR_IA32_SPEC_CTRL:
4144             env->spec_ctrl = msrs[i].data;
4145             break;
4146         case MSR_AMD64_TSC_RATIO:
4147             env->amd_tsc_scale_msr = msrs[i].data;
4148             break;
4149         case MSR_IA32_TSX_CTRL:
4150             env->tsx_ctrl = msrs[i].data;
4151             break;
4152         case MSR_VIRT_SSBD:
4153             env->virt_ssbd = msrs[i].data;
4154             break;
4155         case MSR_IA32_RTIT_CTL:
4156             env->msr_rtit_ctrl = msrs[i].data;
4157             break;
4158         case MSR_IA32_RTIT_STATUS:
4159             env->msr_rtit_status = msrs[i].data;
4160             break;
4161         case MSR_IA32_RTIT_OUTPUT_BASE:
4162             env->msr_rtit_output_base = msrs[i].data;
4163             break;
4164         case MSR_IA32_RTIT_OUTPUT_MASK:
4165             env->msr_rtit_output_mask = msrs[i].data;
4166             break;
4167         case MSR_IA32_RTIT_CR3_MATCH:
4168             env->msr_rtit_cr3_match = msrs[i].data;
4169             break;
4170         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
4171             env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data;
4172             break;
4173         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
4174             env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] =
4175                            msrs[i].data;
4176             break;
4177         case MSR_IA32_XFD:
4178             env->msr_xfd = msrs[i].data;
4179             break;
4180         case MSR_IA32_XFD_ERR:
4181             env->msr_xfd_err = msrs[i].data;
4182             break;
4183         case MSR_ARCH_LBR_CTL:
4184             env->msr_lbr_ctl = msrs[i].data;
4185             break;
4186         case MSR_ARCH_LBR_DEPTH:
4187             env->msr_lbr_depth = msrs[i].data;
4188             break;
4189         case MSR_ARCH_LBR_FROM_0 ... MSR_ARCH_LBR_FROM_0 + 31:
4190             env->lbr_records[index - MSR_ARCH_LBR_FROM_0].from = msrs[i].data;
4191             break;
4192         case MSR_ARCH_LBR_TO_0 ... MSR_ARCH_LBR_TO_0 + 31:
4193             env->lbr_records[index - MSR_ARCH_LBR_TO_0].to = msrs[i].data;
4194             break;
4195         case MSR_ARCH_LBR_INFO_0 ... MSR_ARCH_LBR_INFO_0 + 31:
4196             env->lbr_records[index - MSR_ARCH_LBR_INFO_0].info = msrs[i].data;
4197             break;
4198         }
4199     }
4200 
4201     return 0;
4202 }
4203 
4204 static int kvm_put_mp_state(X86CPU *cpu)
4205 {
4206     struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };
4207 
4208     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
4209 }
4210 
4211 static int kvm_get_mp_state(X86CPU *cpu)
4212 {
4213     CPUState *cs = CPU(cpu);
4214     CPUX86State *env = &cpu->env;
4215     struct kvm_mp_state mp_state;
4216     int ret;
4217 
4218     ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state);
4219     if (ret < 0) {
4220         return ret;
4221     }
4222     env->mp_state = mp_state.mp_state;
4223     if (kvm_irqchip_in_kernel()) {
4224         cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
4225     }
4226     return 0;
4227 }
4228 
4229 static int kvm_get_apic(X86CPU *cpu)
4230 {
4231     DeviceState *apic = cpu->apic_state;
4232     struct kvm_lapic_state kapic;
4233     int ret;
4234 
4235     if (apic && kvm_irqchip_in_kernel()) {
4236         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
4237         if (ret < 0) {
4238             return ret;
4239         }
4240 
4241         kvm_get_apic_state(apic, &kapic);
4242     }
4243     return 0;
4244 }
4245 
4246 static int kvm_put_vcpu_events(X86CPU *cpu, int level)
4247 {
4248     CPUState *cs = CPU(cpu);
4249     CPUX86State *env = &cpu->env;
4250     struct kvm_vcpu_events events = {};
4251 
4252     if (!kvm_has_vcpu_events()) {
4253         return 0;
4254     }
4255 
4256     events.flags = 0;
4257 
4258     if (has_exception_payload) {
4259         events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
4260         events.exception.pending = env->exception_pending;
4261         events.exception_has_payload = env->exception_has_payload;
4262         events.exception_payload = env->exception_payload;
4263     }
4264     events.exception.nr = env->exception_nr;
4265     events.exception.injected = env->exception_injected;
4266     events.exception.has_error_code = env->has_error_code;
4267     events.exception.error_code = env->error_code;
4268 
4269     events.interrupt.injected = (env->interrupt_injected >= 0);
4270     events.interrupt.nr = env->interrupt_injected;
4271     events.interrupt.soft = env->soft_interrupt;
4272 
4273     events.nmi.injected = env->nmi_injected;
4274     events.nmi.pending = env->nmi_pending;
4275     events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
4276 
4277     events.sipi_vector = env->sipi_vector;
4278 
4279     if (has_msr_smbase) {
4280         events.smi.smm = !!(env->hflags & HF_SMM_MASK);
4281         events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK);
4282         if (kvm_irqchip_in_kernel()) {
4283             /* As soon as these are moved to the kernel, remove them
4284              * from cs->interrupt_request.
4285              */
4286             events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI;
4287             events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT;
4288             cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI);
4289         } else {
4290             /* Keep these in cs->interrupt_request.  */
4291             events.smi.pending = 0;
4292             events.smi.latched_init = 0;
4293         }
4294         /* Stop SMI delivery on old machine types to avoid a reboot
4295          * on an inward migration of an old VM.
4296          */
4297         if (!cpu->kvm_no_smi_migration) {
4298             events.flags |= KVM_VCPUEVENT_VALID_SMM;
4299         }
4300     }
4301 
4302     if (level >= KVM_PUT_RESET_STATE) {
4303         events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
4304         if (env->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
4305             events.flags |= KVM_VCPUEVENT_VALID_SIPI_VECTOR;
4306         }
4307     }
4308 
4309     if (has_triple_fault_event) {
4310         events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
4311         events.triple_fault.pending = env->triple_fault_pending;
4312     }
4313 
4314     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
4315 }
4316 
4317 static int kvm_get_vcpu_events(X86CPU *cpu)
4318 {
4319     CPUX86State *env = &cpu->env;
4320     struct kvm_vcpu_events events;
4321     int ret;
4322 
4323     if (!kvm_has_vcpu_events()) {
4324         return 0;
4325     }
4326 
4327     memset(&events, 0, sizeof(events));
4328     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
4329     if (ret < 0) {
4330        return ret;
4331     }
4332 
4333     if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
4334         env->exception_pending = events.exception.pending;
4335         env->exception_has_payload = events.exception_has_payload;
4336         env->exception_payload = events.exception_payload;
4337     } else {
4338         env->exception_pending = 0;
4339         env->exception_has_payload = false;
4340     }
4341     env->exception_injected = events.exception.injected;
4342     env->exception_nr =
4343         (env->exception_pending || env->exception_injected) ?
4344         events.exception.nr : -1;
4345     env->has_error_code = events.exception.has_error_code;
4346     env->error_code = events.exception.error_code;
4347 
4348     env->interrupt_injected =
4349         events.interrupt.injected ? events.interrupt.nr : -1;
4350     env->soft_interrupt = events.interrupt.soft;
4351 
4352     env->nmi_injected = events.nmi.injected;
4353     env->nmi_pending = events.nmi.pending;
4354     if (events.nmi.masked) {
4355         env->hflags2 |= HF2_NMI_MASK;
4356     } else {
4357         env->hflags2 &= ~HF2_NMI_MASK;
4358     }
4359 
4360     if (events.flags & KVM_VCPUEVENT_VALID_SMM) {
4361         if (events.smi.smm) {
4362             env->hflags |= HF_SMM_MASK;
4363         } else {
4364             env->hflags &= ~HF_SMM_MASK;
4365         }
4366         if (events.smi.pending) {
4367             cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
4368         } else {
4369             cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
4370         }
4371         if (events.smi.smm_inside_nmi) {
4372             env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK;
4373         } else {
4374             env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
4375         }
4376         if (events.smi.latched_init) {
4377             cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
4378         } else {
4379             cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
4380         }
4381     }
4382 
4383     if (events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
4384         env->triple_fault_pending = events.triple_fault.pending;
4385     }
4386 
4387     env->sipi_vector = events.sipi_vector;
4388 
4389     return 0;
4390 }
4391 
4392 static int kvm_guest_debug_workarounds(X86CPU *cpu)
4393 {
4394     CPUState *cs = CPU(cpu);
4395     CPUX86State *env = &cpu->env;
4396     int ret = 0;
4397     unsigned long reinject_trap = 0;
4398 
4399     if (!kvm_has_vcpu_events()) {
4400         if (env->exception_nr == EXCP01_DB) {
4401             reinject_trap = KVM_GUESTDBG_INJECT_DB;
4402         } else if (env->exception_injected == EXCP03_INT3) {
4403             reinject_trap = KVM_GUESTDBG_INJECT_BP;
4404         }
4405         kvm_reset_exception(env);
4406     }
4407 
4408     /*
4409      * Kernels before KVM_CAP_X86_ROBUST_SINGLESTEP overwrote flags.TF
4410      * injected via SET_GUEST_DEBUG while updating GP regs. Work around this
4411      * by updating the debug state once again if single-stepping is on.
4412      * Another reason to call kvm_update_guest_debug here is a pending debug
4413      * trap raise by the guest. On kernels without SET_VCPU_EVENTS we have to
4414      * reinject them via SET_GUEST_DEBUG.
4415      */
4416     if (reinject_trap ||
4417         (!kvm_has_robust_singlestep() && cs->singlestep_enabled)) {
4418         ret = kvm_update_guest_debug(cs, reinject_trap);
4419     }
4420     return ret;
4421 }
4422 
4423 static int kvm_put_debugregs(X86CPU *cpu)
4424 {
4425     CPUX86State *env = &cpu->env;
4426     struct kvm_debugregs dbgregs;
4427     int i;
4428 
4429     if (!kvm_has_debugregs()) {
4430         return 0;
4431     }
4432 
4433     memset(&dbgregs, 0, sizeof(dbgregs));
4434     for (i = 0; i < 4; i++) {
4435         dbgregs.db[i] = env->dr[i];
4436     }
4437     dbgregs.dr6 = env->dr[6];
4438     dbgregs.dr7 = env->dr[7];
4439     dbgregs.flags = 0;
4440 
4441     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
4442 }
4443 
4444 static int kvm_get_debugregs(X86CPU *cpu)
4445 {
4446     CPUX86State *env = &cpu->env;
4447     struct kvm_debugregs dbgregs;
4448     int i, ret;
4449 
4450     if (!kvm_has_debugregs()) {
4451         return 0;
4452     }
4453 
4454     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
4455     if (ret < 0) {
4456         return ret;
4457     }
4458     for (i = 0; i < 4; i++) {
4459         env->dr[i] = dbgregs.db[i];
4460     }
4461     env->dr[4] = env->dr[6] = dbgregs.dr6;
4462     env->dr[5] = env->dr[7] = dbgregs.dr7;
4463 
4464     return 0;
4465 }
4466 
4467 static int kvm_put_nested_state(X86CPU *cpu)
4468 {
4469     CPUX86State *env = &cpu->env;
4470     int max_nested_state_len = kvm_max_nested_state_length();
4471 
4472     if (!env->nested_state) {
4473         return 0;
4474     }
4475 
4476     /*
4477      * Copy flags that are affected by reset from env->hflags and env->hflags2.
4478      */
4479     if (env->hflags & HF_GUEST_MASK) {
4480         env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE;
4481     } else {
4482         env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE;
4483     }
4484 
4485     /* Don't set KVM_STATE_NESTED_GIF_SET on VMX as it is illegal */
4486     if (cpu_has_svm(env) && (env->hflags2 & HF2_GIF_MASK)) {
4487         env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET;
4488     } else {
4489         env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET;
4490     }
4491 
4492     assert(env->nested_state->size <= max_nested_state_len);
4493     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
4494 }
4495 
4496 static int kvm_get_nested_state(X86CPU *cpu)
4497 {
4498     CPUX86State *env = &cpu->env;
4499     int max_nested_state_len = kvm_max_nested_state_length();
4500     int ret;
4501 
4502     if (!env->nested_state) {
4503         return 0;
4504     }
4505 
4506     /*
4507      * It is possible that migration restored a smaller size into
4508      * nested_state->hdr.size than what our kernel supports.
4509      * We preserve the migration origin's nested_state->hdr.size for the
4510      * call to KVM_SET_NESTED_STATE, but want our next call to
4511      * KVM_GET_NESTED_STATE to use the maximum size our kernel supports.
4512      */
4513     env->nested_state->size = max_nested_state_len;
4514 
4515     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
4516     if (ret < 0) {
4517         return ret;
4518     }
4519 
4520     /*
4521      * Copy flags that are affected by reset to env->hflags and env->hflags2.
4522      */
4523     if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) {
4524         env->hflags |= HF_GUEST_MASK;
4525     } else {
4526         env->hflags &= ~HF_GUEST_MASK;
4527     }
4528 
4529     /* Keep HF2_GIF_MASK set on !SVM as x86_cpu_pending_interrupt() needs it */
4530     if (cpu_has_svm(env)) {
4531         if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) {
4532             env->hflags2 |= HF2_GIF_MASK;
4533         } else {
4534             env->hflags2 &= ~HF2_GIF_MASK;
4535         }
4536     }
4537 
4538     return ret;
4539 }
4540 
4541 int kvm_arch_put_registers(CPUState *cpu, int level)
4542 {
4543     X86CPU *x86_cpu = X86_CPU(cpu);
4544     int ret;
4545 
4546     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
4547 
4548     /*
4549      * Put MSR_IA32_FEATURE_CONTROL first; this ensures the VM gets out of VMX
4550      * root operation upon vCPU reset. kvm_put_msr_feature_control() should also
4551      * precede kvm_put_nested_state() when 'real' nested state is set.
4552      */
4553     if (level >= KVM_PUT_RESET_STATE) {
4554         ret = kvm_put_msr_feature_control(x86_cpu);
4555         if (ret < 0) {
4556             return ret;
4557         }
4558     }
4559 
4560     /* must be before kvm_put_nested_state so that EFER.SVME is set */
4561     ret = has_sregs2 ? kvm_put_sregs2(x86_cpu) : kvm_put_sregs(x86_cpu);
4562     if (ret < 0) {
4563         return ret;
4564     }
4565 
4566     if (level >= KVM_PUT_RESET_STATE) {
4567         ret = kvm_put_nested_state(x86_cpu);
4568         if (ret < 0) {
4569             return ret;
4570         }
4571     }
4572 
4573     if (level == KVM_PUT_FULL_STATE) {
4574         /* We don't check for kvm_arch_set_tsc_khz() errors here,
4575          * because TSC frequency mismatch shouldn't abort migration,
4576          * unless the user explicitly asked for a more strict TSC
4577          * setting (e.g. using an explicit "tsc-freq" option).
4578          */
4579         kvm_arch_set_tsc_khz(cpu);
4580     }
4581 
4582     ret = kvm_getput_regs(x86_cpu, 1);
4583     if (ret < 0) {
4584         return ret;
4585     }
4586     ret = kvm_put_xsave(x86_cpu);
4587     if (ret < 0) {
4588         return ret;
4589     }
4590     ret = kvm_put_xcrs(x86_cpu);
4591     if (ret < 0) {
4592         return ret;
4593     }
4594     /* must be before kvm_put_msrs */
4595     ret = kvm_inject_mce_oldstyle(x86_cpu);
4596     if (ret < 0) {
4597         return ret;
4598     }
4599     ret = kvm_put_msrs(x86_cpu, level);
4600     if (ret < 0) {
4601         return ret;
4602     }
4603     ret = kvm_put_vcpu_events(x86_cpu, level);
4604     if (ret < 0) {
4605         return ret;
4606     }
4607     if (level >= KVM_PUT_RESET_STATE) {
4608         ret = kvm_put_mp_state(x86_cpu);
4609         if (ret < 0) {
4610             return ret;
4611         }
4612     }
4613 
4614     ret = kvm_put_tscdeadline_msr(x86_cpu);
4615     if (ret < 0) {
4616         return ret;
4617     }
4618     ret = kvm_put_debugregs(x86_cpu);
4619     if (ret < 0) {
4620         return ret;
4621     }
4622     /* must be last */
4623     ret = kvm_guest_debug_workarounds(x86_cpu);
4624     if (ret < 0) {
4625         return ret;
4626     }
4627     return 0;
4628 }
4629 
4630 int kvm_arch_get_registers(CPUState *cs)
4631 {
4632     X86CPU *cpu = X86_CPU(cs);
4633     int ret;
4634 
4635     assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs));
4636 
4637     ret = kvm_get_vcpu_events(cpu);
4638     if (ret < 0) {
4639         goto out;
4640     }
4641     /*
4642      * KVM_GET_MPSTATE can modify CS and RIP, so call it before
4643      * KVM_GET_REGS and KVM_GET_SREGS.
4644      */
4645     ret = kvm_get_mp_state(cpu);
4646     if (ret < 0) {
4647         goto out;
4648     }
4649     ret = kvm_getput_regs(cpu, 0);
4650     if (ret < 0) {
4651         goto out;
4652     }
4653     ret = kvm_get_xsave(cpu);
4654     if (ret < 0) {
4655         goto out;
4656     }
4657     ret = kvm_get_xcrs(cpu);
4658     if (ret < 0) {
4659         goto out;
4660     }
4661     ret = has_sregs2 ? kvm_get_sregs2(cpu) : kvm_get_sregs(cpu);
4662     if (ret < 0) {
4663         goto out;
4664     }
4665     ret = kvm_get_msrs(cpu);
4666     if (ret < 0) {
4667         goto out;
4668     }
4669     ret = kvm_get_apic(cpu);
4670     if (ret < 0) {
4671         goto out;
4672     }
4673     ret = kvm_get_debugregs(cpu);
4674     if (ret < 0) {
4675         goto out;
4676     }
4677     ret = kvm_get_nested_state(cpu);
4678     if (ret < 0) {
4679         goto out;
4680     }
4681     ret = 0;
4682  out:
4683     cpu_sync_bndcs_hflags(&cpu->env);
4684     return ret;
4685 }
4686 
4687 void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
4688 {
4689     X86CPU *x86_cpu = X86_CPU(cpu);
4690     CPUX86State *env = &x86_cpu->env;
4691     int ret;
4692 
4693     /* Inject NMI */
4694     if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
4695         if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
4696             qemu_mutex_lock_iothread();
4697             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
4698             qemu_mutex_unlock_iothread();
4699             DPRINTF("injected NMI\n");
4700             ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
4701             if (ret < 0) {
4702                 fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
4703                         strerror(-ret));
4704             }
4705         }
4706         if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
4707             qemu_mutex_lock_iothread();
4708             cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
4709             qemu_mutex_unlock_iothread();
4710             DPRINTF("injected SMI\n");
4711             ret = kvm_vcpu_ioctl(cpu, KVM_SMI);
4712             if (ret < 0) {
4713                 fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n",
4714                         strerror(-ret));
4715             }
4716         }
4717     }
4718 
4719     if (!kvm_pic_in_kernel()) {
4720         qemu_mutex_lock_iothread();
4721     }
4722 
4723     /* Force the VCPU out of its inner loop to process any INIT requests
4724      * or pending TPR access reports (the latter only matter for the
4725      * userspace APIC, but it is cheap to combine the checks here).
4726      */
4727     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
4728         if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
4729             !(env->hflags & HF_SMM_MASK)) {
4730             cpu->exit_request = 1;
4731         }
4732         if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
4733             cpu->exit_request = 1;
4734         }
4735     }
4736 
4737     if (!kvm_pic_in_kernel()) {
4738         /* Try to inject an interrupt if the guest can accept it */
4739         if (run->ready_for_interrupt_injection &&
4740             (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
4741             (env->eflags & IF_MASK)) {
4742             int irq;
4743 
4744             cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
4745             irq = cpu_get_pic_interrupt(env);
4746             if (irq >= 0) {
4747                 struct kvm_interrupt intr;
4748 
4749                 intr.irq = irq;
4750                 DPRINTF("injected interrupt %d\n", irq);
4751                 ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr);
4752                 if (ret < 0) {
4753                     fprintf(stderr,
4754                             "KVM: injection failed, interrupt lost (%s)\n",
4755                             strerror(-ret));
4756                 }
4757             }
4758         }
4759 
4760         /* If we have a pending interrupt but the guest is not ready to
4761          * receive it, request an interrupt window exit.  This will
4762          * cause a return to userspace as soon as the guest is ready to
4763          * receive interrupts. */
4764         if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
4765             run->request_interrupt_window = 1;
4766         } else {
4767             run->request_interrupt_window = 0;
4768         }
4769 
4770         DPRINTF("setting tpr\n");
4771         run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);
4772 
4773         qemu_mutex_unlock_iothread();
4774     }
4775 }
4776 
4777 static void kvm_rate_limit_on_bus_lock(void)
4778 {
4779     uint64_t delay_ns = ratelimit_calculate_delay(&bus_lock_ratelimit_ctrl, 1);
4780 
4781     if (delay_ns) {
4782         g_usleep(delay_ns / SCALE_US);
4783     }
4784 }
4785 
4786 MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
4787 {
4788     X86CPU *x86_cpu = X86_CPU(cpu);
4789     CPUX86State *env = &x86_cpu->env;
4790 
4791     if (run->flags & KVM_RUN_X86_SMM) {
4792         env->hflags |= HF_SMM_MASK;
4793     } else {
4794         env->hflags &= ~HF_SMM_MASK;
4795     }
4796     if (run->if_flag) {
4797         env->eflags |= IF_MASK;
4798     } else {
4799         env->eflags &= ~IF_MASK;
4800     }
4801     if (run->flags & KVM_RUN_X86_BUS_LOCK) {
4802         kvm_rate_limit_on_bus_lock();
4803     }
4804 
4805     /* We need to protect the APIC state against concurrent accesses from
4806      * different threads in case the userspace irqchip is used. */
4807     if (!kvm_irqchip_in_kernel()) {
4808         qemu_mutex_lock_iothread();
4809     }
4810     cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8);
4811     cpu_set_apic_base(x86_cpu->apic_state, run->apic_base);
4812     if (!kvm_irqchip_in_kernel()) {
4813         qemu_mutex_unlock_iothread();
4814     }
4815     return cpu_get_mem_attrs(env);
4816 }
4817 
4818 int kvm_arch_process_async_events(CPUState *cs)
4819 {
4820     X86CPU *cpu = X86_CPU(cs);
4821     CPUX86State *env = &cpu->env;
4822 
4823     if (cs->interrupt_request & CPU_INTERRUPT_MCE) {
4824         /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
4825         assert(env->mcg_cap);
4826 
4827         cs->interrupt_request &= ~CPU_INTERRUPT_MCE;
4828 
4829         kvm_cpu_synchronize_state(cs);
4830 
4831         if (env->exception_nr == EXCP08_DBLE) {
4832             /* this means triple fault */
4833             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
4834             cs->exit_request = 1;
4835             return 0;
4836         }
4837         kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
4838         env->has_error_code = 0;
4839 
4840         cs->halted = 0;
4841         if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
4842             env->mp_state = KVM_MP_STATE_RUNNABLE;
4843         }
4844     }
4845 
4846     if ((cs->interrupt_request & CPU_INTERRUPT_INIT) &&
4847         !(env->hflags & HF_SMM_MASK)) {
4848         kvm_cpu_synchronize_state(cs);
4849         do_cpu_init(cpu);
4850     }
4851 
4852     if (kvm_irqchip_in_kernel()) {
4853         return 0;
4854     }
4855 
4856     if (cs->interrupt_request & CPU_INTERRUPT_POLL) {
4857         cs->interrupt_request &= ~CPU_INTERRUPT_POLL;
4858         apic_poll_irq(cpu->apic_state);
4859     }
4860     if (((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
4861          (env->eflags & IF_MASK)) ||
4862         (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
4863         cs->halted = 0;
4864     }
4865     if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
4866         kvm_cpu_synchronize_state(cs);
4867         do_cpu_sipi(cpu);
4868     }
4869     if (cs->interrupt_request & CPU_INTERRUPT_TPR) {
4870         cs->interrupt_request &= ~CPU_INTERRUPT_TPR;
4871         kvm_cpu_synchronize_state(cs);
4872         apic_handle_tpr_access_report(cpu->apic_state, env->eip,
4873                                       env->tpr_access_type);
4874     }
4875 
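    /*
     * Returning cs->halted (non-zero) tells the common KVM run loop to treat
     * the vCPU as halted and skip KVM_RUN until it becomes runnable again.
     */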
4876     return cs->halted;
4877 }
4878 
4879 static int kvm_handle_halt(X86CPU *cpu)
4880 {
4881     CPUState *cs = CPU(cpu);
4882     CPUX86State *env = &cpu->env;
4883 
4884     if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
4885           (env->eflags & IF_MASK)) &&
4886         !(cs->interrupt_request & CPU_INTERRUPT_NMI)) {
4887         cs->halted = 1;
4888         return EXCP_HLT;
4889     }
4890 
4891     return 0;
4892 }
4893 
4894 static int kvm_handle_tpr_access(X86CPU *cpu)
4895 {
4896     CPUState *cs = CPU(cpu);
4897     struct kvm_run *run = cs->kvm_run;
4898 
4899     apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip,
4900                                   run->tpr_access.is_write ? TPR_ACCESS_WRITE
4901                                                            : TPR_ACCESS_READ);
4902     return 1;
4903 }
4904 
4905 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
4906 {
4907     static const uint8_t int3 = 0xcc;
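    /*
     * Save the original byte at bp->pc into bp->saved_insn and patch in a
     * one-byte INT3 (0xcc); kvm_arch_remove_sw_breakpoint() below restores
     * the saved byte, provided the INT3 is still in place.
     */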
4908 
4909     if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
4910         cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) {
4911         return -EINVAL;
4912     }
4913     return 0;
4914 }
4915 
4916 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
4917 {
4918     uint8_t int3;
4919 
4920     if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0)) {
4921         return -EINVAL;
4922     }
4923     if (int3 != 0xcc) {
4924         return 0;
4925     }
4926     if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
4927         return -EINVAL;
4928     }
4929     return 0;
4930 }
4931 
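/*
 * x86 exposes four debug address registers (DR0-DR3), so at most four
 * hardware breakpoints/watchpoints can be armed at any one time.
 */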
4932 static struct {
4933     target_ulong addr;
4934     int len;
4935     int type;
4936 } hw_breakpoint[4];
4937 
4938 static int nb_hw_breakpoint;
4939 
4940 static int find_hw_breakpoint(target_ulong addr, int len, int type)
4941 {
4942     int n;
4943 
4944     for (n = 0; n < nb_hw_breakpoint; n++) {
4945         if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
4946             (hw_breakpoint[n].len == len || len == -1)) {
4947             return n;
4948         }
4949     }
4950     return -1;
4951 }
4952 
4953 int kvm_arch_insert_hw_breakpoint(target_ulong addr,
4954                                   target_ulong len, int type)
4955 {
4956     switch (type) {
4957     case GDB_BREAKPOINT_HW:
4958         len = 1;
4959         break;
4960     case GDB_WATCHPOINT_WRITE:
4961     case GDB_WATCHPOINT_ACCESS:
4962         switch (len) {
4963         case 1:
4964             break;
4965         case 2:
4966         case 4:
4967         case 8:
4968             if (addr & (len - 1)) {
4969                 return -EINVAL;
4970             }
4971             break;
4972         default:
4973             return -EINVAL;
4974         }
4975         break;
4976     default:
4977         return -ENOSYS;
4978     }
4979 
4980     if (nb_hw_breakpoint == 4) {
4981         return -ENOBUFS;
4982     }
4983     if (find_hw_breakpoint(addr, len, type) >= 0) {
4984         return -EEXIST;
4985     }
4986     hw_breakpoint[nb_hw_breakpoint].addr = addr;
4987     hw_breakpoint[nb_hw_breakpoint].len = len;
4988     hw_breakpoint[nb_hw_breakpoint].type = type;
4989     nb_hw_breakpoint++;
4990 
4991     return 0;
4992 }
4993 
4994 int kvm_arch_remove_hw_breakpoint(target_ulong addr,
4995                                   target_ulong len, int type)
4996 {
4997     int n;
4998 
4999     n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
5000     if (n < 0) {
5001         return -ENOENT;
5002     }
5003     nb_hw_breakpoint--;
5004     hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];
5005 
5006     return 0;
5007 }
5008 
5009 void kvm_arch_remove_all_hw_breakpoints(void)
5010 {
5011     nb_hw_breakpoint = 0;
5012 }
5013 
5014 static CPUWatchpoint hw_watchpoint;
5015 
5016 static int kvm_handle_debug(X86CPU *cpu,
5017                             struct kvm_debug_exit_arch *arch_info)
5018 {
5019     CPUState *cs = CPU(cpu);
5020     CPUX86State *env = &cpu->env;
5021     int ret = 0;
5022     int n;
5023 
5024     if (arch_info->exception == EXCP01_DB) {
5025         if (arch_info->dr6 & DR6_BS) {
5026             if (cs->singlestep_enabled) {
5027                 ret = EXCP_DEBUG;
5028             }
5029         } else {
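            /*
             * DR6 bits 0-3 indicate which debug register triggered; the
             * matching R/W field in DR7 (bits 16 + n*4) gives the type:
             * 0 = instruction execution, 1 = data write, 3 = data read/write.
             */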
5030             for (n = 0; n < 4; n++) {
5031                 if (arch_info->dr6 & (1 << n)) {
5032                     switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
5033                     case 0x0:
5034                         ret = EXCP_DEBUG;
5035                         break;
5036                     case 0x1:
5037                         ret = EXCP_DEBUG;
5038                         cs->watchpoint_hit = &hw_watchpoint;
5039                         hw_watchpoint.vaddr = hw_breakpoint[n].addr;
5040                         hw_watchpoint.flags = BP_MEM_WRITE;
5041                         break;
5042                     case 0x3:
5043                         ret = EXCP_DEBUG;
5044                         cs->watchpoint_hit = &hw_watchpoint;
5045                         hw_watchpoint.vaddr = hw_breakpoint[n].addr;
5046                         hw_watchpoint.flags = BP_MEM_ACCESS;
5047                         break;
5048                     }
5049                 }
5050             }
5051         }
5052     } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) {
5053         ret = EXCP_DEBUG;
5054     }
5055     if (ret == 0) {
5056         cpu_synchronize_state(cs);
5057         assert(env->exception_nr == -1);
5058 
5059         /* pass to guest */
5060         kvm_queue_exception(env, arch_info->exception,
5061                             arch_info->exception == EXCP01_DB,
5062                             arch_info->dr6);
5063         env->has_error_code = 0;
5064     }
5065 
5066     return ret;
5067 }
5068 
5069 void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
5070 {
5071     const uint8_t type_code[] = {
5072         [GDB_BREAKPOINT_HW] = 0x0,
5073         [GDB_WATCHPOINT_WRITE] = 0x1,
5074         [GDB_WATCHPOINT_ACCESS] = 0x3
5075     };
5076     const uint8_t len_code[] = {
5077         [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
5078     };
5079     int n;
5080 
5081     if (kvm_sw_breakpoints_active(cpu)) {
5082         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
5083     }
5084     if (nb_hw_breakpoint > 0) {
5085         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
5086         dbg->arch.debugreg[7] = 0x0600;
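        /*
         * The loop below encodes each slot into DR7: the global enable bit
         * for slot n (bit 2*n + 1), plus the per-slot R/W type (bits
         * 16 + n*4) and length (bits 18 + n*4) fields, using the GDB-to-DR7
         * encodings defined above.
         */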
5087         for (n = 0; n < nb_hw_breakpoint; n++) {
5088             dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
5089             dbg->arch.debugreg[7] |= (2 << (n * 2)) |
5090                 (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
5091                 ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
5092         }
5093     }
5094 }
5095 
5096 static bool has_sgx_provisioning;
5097 
5098 static bool __kvm_enable_sgx_provisioning(KVMState *s)
5099 {
5100     int fd, ret;
5101 
5102     if (!kvm_vm_check_extension(s, KVM_CAP_SGX_ATTRIBUTE)) {
5103         return false;
5104     }
5105 
5106     fd = qemu_open_old("/dev/sgx_provision", O_RDONLY);
5107     if (fd < 0) {
5108         return false;
5109     }
5110 
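    /*
     * Hand the /dev/sgx_provision fd to KVM; with KVM_CAP_SGX_ATTRIBUTE this
     * allows the guest to be granted the SGX PROVISIONKEY attribute.
     */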
5111     ret = kvm_vm_enable_cap(s, KVM_CAP_SGX_ATTRIBUTE, 0, fd);
5112     if (ret) {
5113         error_report("Could not enable SGX PROVISIONKEY: %s", strerror(-ret));
5114         exit(1);
5115     }
5116     close(fd);
5117     return true;
5118 }
5119 
5120 bool kvm_enable_sgx_provisioning(KVMState *s)
5121 {
5122     return MEMORIZE(__kvm_enable_sgx_provisioning(s), has_sgx_provisioning);
5123 }
5124 
5125 static bool host_supports_vmx(void)
5126 {
5127     uint32_t ecx, unused;
5128 
5129     host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
5130     return ecx & CPUID_EXT_VMX;
5131 }
5132 
5133 #define VMX_INVALID_GUEST_STATE 0x80000021
5134 
5135 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
5136 {
5137     X86CPU *cpu = X86_CPU(cs);
5138     uint64_t code;
5139     int ret;
5140 
5141     switch (run->exit_reason) {
5142     case KVM_EXIT_HLT:
5143         DPRINTF("handle_hlt\n");
5144         qemu_mutex_lock_iothread();
5145         ret = kvm_handle_halt(cpu);
5146         qemu_mutex_unlock_iothread();
5147         break;
5148     case KVM_EXIT_SET_TPR:
5149         ret = 0;
5150         break;
5151     case KVM_EXIT_TPR_ACCESS:
5152         qemu_mutex_lock_iothread();
5153         ret = kvm_handle_tpr_access(cpu);
5154         qemu_mutex_unlock_iothread();
5155         break;
5156     case KVM_EXIT_FAIL_ENTRY:
5157         code = run->fail_entry.hardware_entry_failure_reason;
5158         fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
5159                 code);
5160         if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
5161             fprintf(stderr,
5162                     "\nIf you're running a guest on an Intel machine without "
5163                         "unrestricted mode\n"
5164                     "support, the failure can be most likely due to the guest "
5165                         "entering an invalid\n"
5166                     "state for Intel VT. For example, the guest maybe running "
5167                         "in big real mode\n"
5168                     "which is not supported on less recent Intel processors."
5169                         "\n\n");
5170         }
5171         ret = -1;
5172         break;
5173     case KVM_EXIT_EXCEPTION:
5174         fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
5175                 run->ex.exception, run->ex.error_code);
5176         ret = -1;
5177         break;
5178     case KVM_EXIT_DEBUG:
5179         DPRINTF("kvm_exit_debug\n");
5180         qemu_mutex_lock_iothread();
5181         ret = kvm_handle_debug(cpu, &run->debug.arch);
5182         qemu_mutex_unlock_iothread();
5183         break;
5184     case KVM_EXIT_HYPERV:
5185         ret = kvm_hv_handle_exit(cpu, &run->hyperv);
5186         break;
5187     case KVM_EXIT_IOAPIC_EOI:
5188         ioapic_eoi_broadcast(run->eoi.vector);
5189         ret = 0;
5190         break;
5191     case KVM_EXIT_X86_BUS_LOCK:
5192         /* already handled in kvm_arch_post_run */
5193         ret = 0;
5194         break;
5195     default:
5196         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
5197         ret = -1;
5198         break;
5199     }
5200 
5201     return ret;
5202 }
5203 
5204 bool kvm_arch_stop_on_emulation_error(CPUState *cs)
5205 {
5206     X86CPU *cpu = X86_CPU(cs);
5207     CPUX86State *env = &cpu->env;
5208 
5209     kvm_cpu_synchronize_state(cs);
5210     return !(env->cr[0] & CR0_PE_MASK) ||
5211            ((env->segs[R_CS].selector  & 3) != 3);
5212 }
5213 
5214 void kvm_arch_init_irq_routing(KVMState *s)
5215 {
5216     /* We know at this point that we're using the in-kernel
5217      * irqchip, so we can use irqfds, and on x86 we know
5218      * we can use msi via irqfd and GSI routing.
5219      */
5220     kvm_msi_via_irqfd_allowed = true;
5221     kvm_gsi_routing_allowed = true;
5222 
5223     if (kvm_irqchip_is_split()) {
5224         KVMRouteChange c = kvm_irqchip_begin_route_changes(s);
5225         int i;
5226 
5227         /* If the ioapic is in QEMU and the lapics are in KVM, reserve
5228            MSI routes for signaling interrupts to the local apics. */
5229         for (i = 0; i < IOAPIC_NUM_PINS; i++) {
5230             if (kvm_irqchip_add_msi_route(&c, 0, NULL) < 0) {
5231                 error_report("Could not enable split IRQ mode.");
5232                 exit(1);
5233             }
5234         }
5235         kvm_irqchip_commit_route_changes(&c);
5236     }
5237 }
5238 
5239 int kvm_arch_irqchip_create(KVMState *s)
5240 {
5241     int ret;
5242     if (kvm_kernel_irqchip_split()) {
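        /*
         * The final argument (24) is the number of IOAPIC pins reserved for
         * the userspace IOAPIC in split irqchip mode; it presumably matches
         * IOAPIC_NUM_PINS used in kvm_arch_init_irq_routing() above.
         */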
5243         ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24);
5244         if (ret) {
5245             error_report("Could not enable split irqchip mode: %s",
5246                          strerror(-ret));
5247             exit(1);
5248         } else {
5249             DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n");
5250             kvm_split_irqchip = true;
5251             return 1;
5252         }
5253     } else {
5254         return 0;
5255     }
5256 }
5257 
5258 uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address)
5259 {
5260     CPUX86State *env;
5261     uint64_t ext_id;
5262 
5263     if (!first_cpu) {
5264         return address;
5265     }
5266     env = &X86_CPU(first_cpu)->env;
5267     if (!(env->features[FEAT_KVM] & (1 << KVM_FEATURE_MSI_EXT_DEST_ID))) {
5268         return address;
5269     }
5270 
5271     /*
5272      * If the remappable format bit is set, or the upper bits are
5273      * already set in address_hi, or the low extended bits aren't
5274      * there anyway, do nothing.
5275      */
5276     ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT);
5277     if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) || (address >> 32)) {
5278         return address;
5279     }
5280 
5281     address &= ~ext_id;
5282     address |= ext_id << 35;
5283     return address;
5284 }
5285 
5286 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
5287                              uint64_t address, uint32_t data, PCIDevice *dev)
5288 {
5289     X86IOMMUState *iommu = x86_iommu_get_default();
5290 
5291     if (iommu) {
5292         X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu);
5293 
5294         if (class->int_remap) {
5295             int ret;
5296             MSIMessage src, dst;
5297 
5298             src.address = route->u.msi.address_hi;
5299             src.address <<= VTD_MSI_ADDR_HI_SHIFT;
5300             src.address |= route->u.msi.address_lo;
5301             src.data = route->u.msi.data;
5302 
5303             ret = class->int_remap(iommu, &src, &dst, dev ?     \
5304                                    pci_requester_id(dev) :      \
5305                                    X86_IOMMU_SID_INVALID);
5306             if (ret) {
5307                 trace_kvm_x86_fixup_msi_error(route->gsi);
5308                 return 1;
5309             }
5310 
5311             /*
5312              * Handle an untranslated compatibility format interrupt with
5313              * an extended destination ID in the low bits 11-5. */
5314             dst.address = kvm_swizzle_msi_ext_dest_id(dst.address);
5315 
5316             route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
5317             route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
5318             route->u.msi.data = dst.data;
5319             return 0;
5320         }
5321     }
5322 
5323     address = kvm_swizzle_msi_ext_dest_id(address);
5324     route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT;
5325     route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK;
5326     return 0;
5327 }
5328 
5329 typedef struct MSIRouteEntry MSIRouteEntry;
5330 
5331 struct MSIRouteEntry {
5332     PCIDevice *dev;             /* Device pointer */
5333     int vector;                 /* MSI/MSIX vector index */
5334     int virq;                   /* Virtual IRQ index */
5335     QLIST_ENTRY(MSIRouteEntry) list;
5336 };
5337 
5338 /* List of used GSI routes */
5339 static QLIST_HEAD(, MSIRouteEntry) msi_route_list = \
5340     QLIST_HEAD_INITIALIZER(msi_route_list);
5341 
5342 static void kvm_update_msi_routes_all(void *private, bool global,
5343                                       uint32_t index, uint32_t mask)
5344 {
5345     int cnt = 0, vector;
5346     MSIRouteEntry *entry;
5347     MSIMessage msg;
5348     PCIDevice *dev;
5349 
5350     /* TODO: explicit route update */
5351     QLIST_FOREACH(entry, &msi_route_list, list) {
5352         cnt++;
5353         vector = entry->vector;
5354         dev = entry->dev;
5355         if (msix_enabled(dev) && !msix_is_masked(dev, vector)) {
5356             msg = msix_get_message(dev, vector);
5357         } else if (msi_enabled(dev) && !msi_is_masked(dev, vector)) {
5358             msg = msi_get_message(dev, vector);
5359         } else {
5360             /*
5361              * Either MSI/MSIX is disabled for the device, or the
5362              * specific message was masked out.  Skip this one.
5363              */
5364             continue;
5365         }
5366         kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev);
5367     }
5368     kvm_irqchip_commit_routes(kvm_state);
5369     trace_kvm_x86_update_msi_routes(cnt);
5370 }
5371 
5372 int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
5373                                 int vector, PCIDevice *dev)
5374 {
5375     static bool notify_list_inited = false;
5376     MSIRouteEntry *entry;
5377 
5378     if (!dev) {
5379         /* These are (possibly) IOAPIC routes, used only in split
5380          * kernel irqchip mode, whereas we only track routes for
5381          * PCI devices here. */
5382         return 0;
5383     }
5384 
5385     entry = g_new0(MSIRouteEntry, 1);
5386     entry->dev = dev;
5387     entry->vector = vector;
5388     entry->virq = route->gsi;
5389     QLIST_INSERT_HEAD(&msi_route_list, entry, list);
5390 
5391     trace_kvm_x86_add_msi_route(route->gsi);
5392 
5393     if (!notify_list_inited) {
5394         /* The first time we add a route, register ourselves on the
5395          * IOMMU's IEC notifier list if needed. */
5396         X86IOMMUState *iommu = x86_iommu_get_default();
5397         if (iommu) {
5398             x86_iommu_iec_register_notifier(iommu,
5399                                             kvm_update_msi_routes_all,
5400                                             NULL);
5401         }
5402         notify_list_inited = true;
5403     }
5404     return 0;
5405 }
5406 
5407 int kvm_arch_release_virq_post(int virq)
5408 {
5409     MSIRouteEntry *entry, *next;
5410     QLIST_FOREACH_SAFE(entry, &msi_route_list, list, next) {
5411         if (entry->virq == virq) {
5412             trace_kvm_x86_remove_msi_route(virq);
5413             QLIST_REMOVE(entry, list);
5414             g_free(entry);
5415             break;
5416         }
5417     }
5418     return 0;
5419 }
5420 
5421 int kvm_arch_msi_data_to_gsi(uint32_t data)
5422 {
5423     abort();
5424 }
5425 
5426 bool kvm_has_waitpkg(void)
5427 {
5428     return has_msr_umwait;
5429 }
5430 
5431 bool kvm_arch_cpu_check_are_resettable(void)
5432 {
5433     return !sev_es_enabled();
5434 }
5435 
5436 #define ARCH_REQ_XCOMP_GUEST_PERM       0x1025
5437 
5438 void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask)
5439 {
5440     KVMState *s = kvm_state;
5441     uint64_t supported;
5442 
5443     mask &= XSTATE_DYNAMIC_MASK;
5444     if (!mask) {
5445         return;
5446     }
5447     /*
5448      * Just ignore bits that are not in CPUID[EAX=0xD,ECX=0].
5449      * ARCH_REQ_XCOMP_GUEST_PERM would fail, and QEMU has warned
5450      * about them already because they are not supported features.
5451      */
5452     supported = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX);
5453     supported |= (uint64_t)kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EDX) << 32;
5454     mask &= supported;
5455 
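    /*
     * Request guest permission for each remaining dynamically-enabled XSAVE
     * component (e.g. AMX tile data), one bit at a time, via
     * arch_prctl(ARCH_REQ_XCOMP_GUEST_PERM).
     */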
5456     while (mask) {
5457         int bit = ctz64(mask);
5458         int rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit);
5459         if (rc) {
5460             /*
5461              * Older kernel versions (< 5.17) do not support
5462              * ARCH_REQ_XCOMP_GUEST_PERM, but they also do not report
5463              * any dynamic features via kvm_arch_get_supported_cpuid.
5464              */
5465             warn_report("prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure "
5466                         "for feature bit %d", bit);
5467         }
5468         mask &= ~BIT_ULL(bit);
5469     }
5470 }
5471 
5472 void kvm_arch_accel_class_init(ObjectClass *oc)
5473 {
5474 }
5475