xref: /openbmc/qemu/target/i386/kvm/kvm.c (revision 2f95279a)
1 /*
2  * QEMU KVM support
3  *
4  * Copyright (C) 2006-2008 Qumranet Technologies
5  * Copyright IBM, Corp. 2008
6  *
7  * Authors:
8  *  Anthony Liguori   <aliguori@us.ibm.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
11  * See the COPYING file in the top-level directory.
12  *
13  */
14 
15 #include "qemu/osdep.h"
16 #include "qapi/qapi-events-run-state.h"
17 #include "qapi/error.h"
18 #include "qapi/visitor.h"
19 #include <sys/ioctl.h>
20 #include <sys/utsname.h>
21 #include <sys/syscall.h>
22 
23 #include <linux/kvm.h>
24 #include <linux/kvm_para.h>
25 #include "standard-headers/asm-x86/kvm_para.h"
26 #include "hw/xen/interface/arch-x86/cpuid.h"
27 
28 #include "cpu.h"
29 #include "host-cpu.h"
30 #include "sysemu/sysemu.h"
31 #include "sysemu/hw_accel.h"
32 #include "sysemu/kvm_int.h"
33 #include "sysemu/runstate.h"
34 #include "kvm_i386.h"
35 #include "../confidential-guest.h"
36 #include "sev.h"
37 #include "xen-emu.h"
38 #include "hyperv.h"
39 #include "hyperv-proto.h"
40 
41 #include "gdbstub/enums.h"
42 #include "qemu/host-utils.h"
43 #include "qemu/main-loop.h"
44 #include "qemu/ratelimit.h"
45 #include "qemu/config-file.h"
46 #include "qemu/error-report.h"
47 #include "qemu/memalign.h"
48 #include "hw/i386/x86.h"
49 #include "hw/i386/kvm/xen_evtchn.h"
50 #include "hw/i386/pc.h"
51 #include "hw/i386/apic.h"
52 #include "hw/i386/apic_internal.h"
53 #include "hw/i386/apic-msidef.h"
54 #include "hw/i386/intel_iommu.h"
55 #include "hw/i386/topology.h"
56 #include "hw/i386/x86-iommu.h"
57 #include "hw/i386/e820_memory_layout.h"
58 
59 #include "hw/xen/xen.h"
60 
61 #include "hw/pci/pci.h"
62 #include "hw/pci/msi.h"
63 #include "hw/pci/msix.h"
64 #include "migration/blocker.h"
65 #include "exec/memattrs.h"
66 #include "trace.h"
67 
68 #include CONFIG_DEVICES
69 
/* Uncomment the next line to make DPRINTF() print to stderr. */
//#define DEBUG_KVM

/*
 * DPRINTF(fmt, ...): printf-style debug trace.  Expands to an fprintf()
 * on stderr when DEBUG_KVM is defined and to an empty statement otherwise.
 */
#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif
79 
/* From arch/x86/kvm/lapic.h */
#define KVM_APIC_BUS_CYCLE_NS       1
/* Cycles per second, derived from the cycle length in nanoseconds above. */
#define KVM_APIC_BUS_FREQUENCY      (1000000000ULL / KVM_APIC_BUS_CYCLE_NS)

/* MSR indices (presumably the legacy kvmclock MSRs - confirm against KVM) */
#define MSR_KVM_WALL_CLOCK  0x11
#define MSR_KVM_SYSTEM_TIME 0x12

/* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
 * 255 kvm_msr_entry structs */
#define MSR_BUF_SIZE 4096
90 
91 static void kvm_init_msrs(X86CPU *cpu);
92 
/*
 * x86 entries for the KVM capability table, terminated by
 * KVM_CAP_LAST_INFO.  Judging by the name these are the capabilities the
 * generic KVM code insists on; the consumer is outside this file.
 */
const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
    KVM_CAP_INFO(SET_TSS_ADDR),
    KVM_CAP_INFO(EXT_CPUID),
    KVM_CAP_INFO(MP_STATE),
    KVM_CAP_INFO(SIGNAL_MSI),
    KVM_CAP_INFO(IRQ_ROUTING),
    KVM_CAP_INFO(DEBUGREGS),
    KVM_CAP_INFO(XSAVE),
    KVM_CAP_INFO(VCPU_EVENTS),
    KVM_CAP_INFO(X86_ROBUST_SINGLESTEP),
    KVM_CAP_INFO(MCE),
    KVM_CAP_INFO(ADJUST_CLOCK),
    KVM_CAP_INFO(SET_IDENTITY_MAP_ADDR),
    KVM_CAP_LAST_INFO
};
108 
/*
 * File-scope feature flags.  All of them are zero-initialised statics;
 * the code that sets them is not part of this section of the file.
 * The has_msr_* names suggest "KVM exposes this MSR" - confirm at the
 * place where they are assigned.
 */
static bool has_msr_star;
static bool has_msr_hsave_pa;
static bool has_msr_tsc_aux;
static bool has_msr_tsc_adjust;
static bool has_msr_tsc_deadline;
static bool has_msr_feature_control;
static bool has_msr_misc_enable;
static bool has_msr_smbase;
static bool has_msr_bndcfgs;
static int lm_capable_kernel;
static bool has_msr_hv_hypercall;
static bool has_msr_hv_crash;
static bool has_msr_hv_reset;
static bool has_msr_hv_vpindex;
/* Returned verbatim by kvm_hv_vpindex_settable(). */
static bool hv_vpindex_settable;
static bool has_msr_hv_runtime;
static bool has_msr_hv_synic;
static bool has_msr_hv_stimer;
static bool has_msr_hv_frequencies;
static bool has_msr_hv_reenlightenment;
static bool has_msr_hv_syndbg_options;
static bool has_msr_xss;
static bool has_msr_umwait;
static bool has_msr_spec_ctrl;
static bool has_tsc_scale_msr;
static bool has_msr_tsx_ctrl;
static bool has_msr_virt_ssbd;
static bool has_msr_smi_count;
/* Consulted by the CPUID.7.0:EDX fixup in kvm_arch_get_supported_cpuid(). */
static bool has_msr_arch_capabs;
static bool has_msr_core_capabs;
static bool has_msr_vmx_vmfunc;
static bool has_msr_ucode_rev;
/* Consulted by kvm_arch_get_supported_msr_feature(). */
static bool has_msr_vmx_procbased_ctls2;
static bool has_msr_perf_capabs;
static bool has_msr_pkrs;

static uint32_t has_architectural_pmu_version;
static uint32_t num_architectural_pmu_gp_counters;
static uint32_t num_architectural_pmu_fixed_counters;

static int has_xsave2;
static int has_xcrs;
static int has_sregs2;
/* Selects the pending-vs-injected path in kvm_queue_exception(). */
static int has_exception_payload;
static int has_triple_fault_event;

static bool has_msr_mcg_ext_ctl;

/* Result of KVM_GET_SUPPORTED_CPUID, filled lazily by get_supported_cpuid(). */
static struct kvm_cpuid2 *cpuid_cache;
static struct kvm_cpuid2 *hv_cpuid_cache;
/* NULL is treated as "no feature MSRs" by kvm_arch_get_supported_msr_feature(). */
static struct kvm_msr_list *kvm_feature_msrs;

static KVMMSRHandlers msr_handlers[KVM_MSR_FILTER_MAX_RANGES];

#define BUS_LOCK_SLICE_TIME 1000000000ULL /* ns */
static RateLimit bus_lock_ratelimit_ctrl;
/* Forward declaration; used by kvm_get_tsc() before the definition. */
static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value);
166 
/*
 * Printable names indexed by KVM_X86_*_VM type.  Used for the
 * "vm-type %s not supported" message in kvm_get_vm_type().  Indices that
 * are not listed here are NULL (or out of bounds if larger than the
 * last initialiser).
 */
static const char *vm_type_name[] = {
    [KVM_X86_DEFAULT_VM] = "default",
    [KVM_X86_SEV_VM] = "SEV",
    [KVM_X86_SEV_ES_VM] = "SEV-ES",
    [KVM_X86_SNP_VM] = "SEV-SNP",
};
173 
174 bool kvm_is_vm_type_supported(int type)
175 {
176     uint32_t machine_types;
177 
178     /*
179      * old KVM doesn't support KVM_CAP_VM_TYPES but KVM_X86_DEFAULT_VM
180      * is always supported
181      */
182     if (type == KVM_X86_DEFAULT_VM) {
183         return true;
184     }
185 
186     machine_types = kvm_check_extension(KVM_STATE(current_machine->accelerator),
187                                         KVM_CAP_VM_TYPES);
188     return !!(machine_types & BIT(type));
189 }
190 
191 int kvm_get_vm_type(MachineState *ms)
192 {
193     int kvm_type = KVM_X86_DEFAULT_VM;
194 
195     if (ms->cgs) {
196         if (!object_dynamic_cast(OBJECT(ms->cgs), TYPE_X86_CONFIDENTIAL_GUEST)) {
197             error_report("configuration type %s not supported for x86 guests",
198                          object_get_typename(OBJECT(ms->cgs)));
199             exit(1);
200         }
201         kvm_type = x86_confidential_guest_kvm_type(
202             X86_CONFIDENTIAL_GUEST(ms->cgs));
203     }
204 
205     if (!kvm_is_vm_type_supported(kvm_type)) {
206         error_report("vm-type %s not supported by KVM", vm_type_name[kvm_type]);
207         exit(1);
208     }
209 
210     return kvm_type;
211 }
212 
213 bool kvm_enable_hypercall(uint64_t enable_mask)
214 {
215     KVMState *s = KVM_STATE(current_accel());
216 
217     return !kvm_vm_enable_cap(s, KVM_CAP_EXIT_HYPERCALL, 0, enable_mask);
218 }
219 
220 bool kvm_has_smm(void)
221 {
222     return kvm_vm_check_extension(kvm_state, KVM_CAP_X86_SMM);
223 }
224 
225 bool kvm_has_adjust_clock_stable(void)
226 {
227     int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
228 
229     return (ret & KVM_CLOCK_TSC_STABLE);
230 }
231 
232 bool kvm_has_exception_payload(void)
233 {
234     return has_exception_payload;
235 }
236 
237 static bool kvm_x2apic_api_set_flags(uint64_t flags)
238 {
239     KVMState *s = KVM_STATE(current_accel());
240 
241     return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags);
242 }
243 
/*
 * MEMORIZE(fn, _result): evaluate the expression @fn at most once per
 * expansion site and cache its value in the lvalue @_result.
 *
 * First pass: marks the site as done, evaluates @fn, stores it in
 * @_result and yields that value as the result of the statement
 * expression.
 * Later passes: execute "return _result;", i.e. they return directly
 * from the *enclosing function* instead of yielding a value.  The macro
 * is therefore only usable inside a function whose return type accepts
 * @_result.
 *
 * The "done" flag is a static local of the statement expression, so each
 * textual use of the macro has its own flag.
 */
#define MEMORIZE(fn, _result) \
    ({ \
        static bool _memorized; \
        \
        if (_memorized) { \
            return _result; \
        } \
        _memorized = true; \
        _result = fn; \
    })
254 
/* Cached outcome of kvm_enable_x2apic(); false until that has run. */
static bool has_x2apic_api;

/* Return the cached flag without talking to KVM. */
bool kvm_has_x2apic_api(void)
{
    return has_x2apic_api;
}
261 
262 bool kvm_enable_x2apic(void)
263 {
264     return MEMORIZE(
265              kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS |
266                                       KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK),
267              has_x2apic_api);
268 }
269 
/* Accessor for the file-scope hv_vpindex_settable flag. */
bool kvm_hv_vpindex_settable(void)
{
    return hv_vpindex_settable;
}
274 
275 static int kvm_get_tsc(CPUState *cs)
276 {
277     X86CPU *cpu = X86_CPU(cs);
278     CPUX86State *env = &cpu->env;
279     uint64_t value;
280     int ret;
281 
282     if (env->tsc_valid) {
283         return 0;
284     }
285 
286     env->tsc_valid = !runstate_is_running();
287 
288     ret = kvm_get_one_msr(cpu, MSR_IA32_TSC, &value);
289     if (ret < 0) {
290         return ret;
291     }
292 
293     env->tsc = value;
294     return 0;
295 }
296 
297 static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
298 {
299     kvm_get_tsc(cpu);
300 }
301 
302 void kvm_synchronize_all_tsc(void)
303 {
304     CPUState *cpu;
305 
306     if (kvm_enabled()) {
307         CPU_FOREACH(cpu) {
308             run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
309         }
310     }
311 }
312 
313 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
314 {
315     struct kvm_cpuid2 *cpuid;
316     int r, size;
317 
318     size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
319     cpuid = g_malloc0(size);
320     cpuid->nent = max;
321     r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
322     if (r == 0 && cpuid->nent >= max) {
323         r = -E2BIG;
324     }
325     if (r < 0) {
326         if (r == -E2BIG) {
327             g_free(cpuid);
328             return NULL;
329         } else {
330             fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
331                     strerror(-r));
332             exit(1);
333         }
334     }
335     return cpuid;
336 }
337 
338 /* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
339  * for all entries.
340  */
341 static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
342 {
343     struct kvm_cpuid2 *cpuid;
344     int max = 1;
345 
346     if (cpuid_cache != NULL) {
347         return cpuid_cache;
348     }
349     while ((cpuid = try_get_cpuid(s, max)) == NULL) {
350         max *= 2;
351     }
352     cpuid_cache = cpuid;
353     return cpuid;
354 }
355 
356 static bool host_tsx_broken(void)
357 {
358     int family, model, stepping;\
359     char vendor[CPUID_VENDOR_SZ + 1];
360 
361     host_cpu_vendor_fms(vendor, &family, &model, &stepping);
362 
363     /* Check if we are running on a Haswell host known to have broken TSX */
364     return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
365            (family == 6) &&
366            ((model == 63 && stepping < 4) ||
367             model == 60 || model == 69 || model == 70);
368 }
369 
370 /* Returns the value for a specific register on the cpuid entry
371  */
372 static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
373 {
374     uint32_t ret = 0;
375     switch (reg) {
376     case R_EAX:
377         ret = entry->eax;
378         break;
379     case R_EBX:
380         ret = entry->ebx;
381         break;
382     case R_ECX:
383         ret = entry->ecx;
384         break;
385     case R_EDX:
386         ret = entry->edx;
387         break;
388     }
389     return ret;
390 }
391 
392 /* Find matching entry for function/index on kvm_cpuid2 struct
393  */
394 static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
395                                                  uint32_t function,
396                                                  uint32_t index)
397 {
398     int i;
399     for (i = 0; i < cpuid->nent; ++i) {
400         if (cpuid->entries[i].function == function &&
401             cpuid->entries[i].index == index) {
402             return &cpuid->entries[i];
403         }
404     }
405     /* not found: */
406     return NULL;
407 }
408 
/*
 * Return the bits QEMU considers supported for one CPUID register.
 *
 * @s:        KVM state used for the ioctls and extension checks
 * @function: CPUID leaf
 * @index:    CPUID sub-leaf
 * @reg:      R_EAX, R_EBX, R_ECX or R_EDX
 *
 * The starting value is the matching register from the cached
 * KVM_GET_SUPPORTED_CPUID table (0 if the leaf/sub-leaf is absent).  A
 * chain of leaf-specific fixups then adds bits KVM does not report but
 * QEMU can provide, and removes bits that cannot work in the current
 * configuration.  Exactly one fixup branch applies per call.
 */
uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
                                      uint32_t index, int reg)
{
    struct kvm_cpuid2 *cpuid;
    uint32_t ret = 0;
    uint32_t cpuid_1_edx, unused;
    uint64_t bitmask;

    cpuid = get_supported_cpuid(s);

    struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
    if (entry) {
        ret = cpuid_entry_get_reg(entry, reg);
    }

    /* Fixups for the data returned by KVM, below */

    if (function == 1 && reg == R_EDX) {
        /* KVM before 2.6.30 misreports the following features */
        ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
        /* KVM never reports CPUID_HT but QEMU can support when vcpus > 1 */
        ret |= CPUID_HT;
    } else if (function == 1 && reg == R_ECX) {
        /* We can set the hypervisor flag, even if KVM does not return it on
         * GET_SUPPORTED_CPUID
         */
        ret |= CPUID_EXT_HYPERVISOR;
        /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
         * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
         * and the irqchip is in the kernel.
         */
        if (kvm_irqchip_in_kernel() &&
                kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
            ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
        }

        /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
         * without the in-kernel irqchip
         */
        if (!kvm_irqchip_in_kernel()) {
            ret &= ~CPUID_EXT_X2APIC;
        }

        /* MONITOR is only offered when MWAIT exits can be disabled. */
        if (enable_cpu_pm) {
            int disable_exits = kvm_check_extension(s,
                                                    KVM_CAP_X86_DISABLE_EXITS);

            if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
                ret |= CPUID_EXT_MONITOR;
            }
        }
    } else if (function == 6 && reg == R_EAX) {
        ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
    } else if (function == 7 && index == 0 && reg == R_EBX) {
        /* Not new instructions, just an optimization.  */
        uint32_t ebx;
        host_cpuid(7, 0, &unused, &ebx, &unused, &unused);
        ret |= ebx & CPUID_7_0_EBX_ERMS;

        if (host_tsx_broken()) {
            ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
        }
    } else if (function == 7 && index == 0 && reg == R_EDX) {
        /* Not new instructions, just an optimization.  */
        uint32_t edx;
        host_cpuid(7, 0, &unused, &unused, &unused, &edx);
        ret |= edx & CPUID_7_0_EDX_FSRM;

        /*
         * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts.
         * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is
         * returned by KVM_GET_MSR_INDEX_LIST.
         */
        if (!has_msr_arch_capabs) {
            ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
        }
    } else if (function == 7 && index == 1 && reg == R_EAX) {
        /* Not new instructions, just an optimization.  */
        uint32_t eax;
        host_cpuid(7, 1, &eax, &unused, &unused, &unused);
        ret |= eax & (CPUID_7_1_EAX_FZRM | CPUID_7_1_EAX_FSRS | CPUID_7_1_EAX_FSRC);
    } else if (function == 7 && index == 2 && reg == R_EDX) {
        /* Pass the host's MCDT_NO bit through. */
        uint32_t edx;
        host_cpuid(7, 2, &unused, &unused, &unused, &edx);
        ret |= edx & CPUID_7_2_EDX_MCDT_NO;
    } else if (function == 0xd && index == 0 &&
               (reg == R_EAX || reg == R_EDX)) {
        /*
         * The value returned by KVM_GET_SUPPORTED_CPUID does not include
         * features that still have to be enabled with the arch_prctl
         * system call.  QEMU needs the full value, which is retrieved
         * with KVM_GET_DEVICE_ATTR.
         */
        struct kvm_device_attr attr = {
            .group = 0,
            .attr = KVM_X86_XCOMP_GUEST_SUPP,
            .addr = (unsigned long) &bitmask
        };

        /* Without system attributes, keep the table value. */
        bool sys_attr = kvm_check_extension(s, KVM_CAP_SYS_ATTRIBUTES);
        if (!sys_attr) {
            return ret;
        }

        /* On failure keep the table value; only -ENXIO is silent. */
        int rc = kvm_ioctl(s, KVM_GET_DEVICE_ATTR, &attr);
        if (rc < 0) {
            if (rc != -ENXIO) {
                warn_report("KVM_GET_DEVICE_ATTR(0, KVM_X86_XCOMP_GUEST_SUPP) "
                            "error: %d", rc);
            }
            return ret;
        }
        /* EAX carries the low 32 bits of the mask, EDX the high 32 bits. */
        ret = (reg == R_EAX) ? bitmask : bitmask >> 32;
    } else if (function == 0x80000001 && reg == R_ECX) {
        /*
         * It's safe to enable TOPOEXT even if it's not returned by
         * GET_SUPPORTED_CPUID.  Unconditionally enabling TOPOEXT here allows
         * us to keep CPU models including TOPOEXT runnable on older kernels.
         */
        ret |= CPUID_EXT3_TOPOEXT;
    } else if (function == 0x80000001 && reg == R_EDX) {
        /* On Intel, kvm returns cpuid according to the Intel spec,
         * so add missing bits according to the AMD spec:
         */
        cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
        ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
    } else if (function == 0x80000007 && reg == R_EBX) {
        ret |= CPUID_8000_0007_EBX_OVERFLOW_RECOV | CPUID_8000_0007_EBX_SUCCOR;
    } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) {
        /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't
         * be enabled without the in-kernel irqchip
         */
        if (!kvm_irqchip_in_kernel()) {
            ret &= ~(1U << KVM_FEATURE_PV_UNHALT);
        }
        if (kvm_irqchip_is_split()) {
            ret |= 1U << KVM_FEATURE_MSI_EXT_DEST_ID;
        }
    } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
        ret |= 1U << KVM_HINTS_REALTIME;
    }

    return ret;
}
553 
554 uint64_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index)
555 {
556     struct {
557         struct kvm_msrs info;
558         struct kvm_msr_entry entries[1];
559     } msr_data = {};
560     uint64_t value;
561     uint32_t ret, can_be_one, must_be_one;
562 
563     if (kvm_feature_msrs == NULL) { /* Host doesn't support feature MSRs */
564         return 0;
565     }
566 
567     /* Check if requested MSR is supported feature MSR */
568     int i;
569     for (i = 0; i < kvm_feature_msrs->nmsrs; i++)
570         if (kvm_feature_msrs->indices[i] == index) {
571             break;
572         }
573     if (i == kvm_feature_msrs->nmsrs) {
574         return 0; /* if the feature MSR is not supported, simply return 0 */
575     }
576 
577     msr_data.info.nmsrs = 1;
578     msr_data.entries[0].index = index;
579 
580     ret = kvm_ioctl(s, KVM_GET_MSRS, &msr_data);
581     if (ret != 1) {
582         error_report("KVM get MSR (index=0x%x) feature failed, %s",
583             index, strerror(-ret));
584         exit(1);
585     }
586 
587     value = msr_data.entries[0].data;
588     switch (index) {
589     case MSR_IA32_VMX_PROCBASED_CTLS2:
590         if (!has_msr_vmx_procbased_ctls2) {
591             /* KVM forgot to add these bits for some time, do this ourselves. */
592             if (kvm_arch_get_supported_cpuid(s, 0xD, 1, R_ECX) &
593                 CPUID_XSAVE_XSAVES) {
594                 value |= (uint64_t)VMX_SECONDARY_EXEC_XSAVES << 32;
595             }
596             if (kvm_arch_get_supported_cpuid(s, 1, 0, R_ECX) &
597                 CPUID_EXT_RDRAND) {
598                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDRAND_EXITING << 32;
599             }
600             if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
601                 CPUID_7_0_EBX_INVPCID) {
602                 value |= (uint64_t)VMX_SECONDARY_EXEC_ENABLE_INVPCID << 32;
603             }
604             if (kvm_arch_get_supported_cpuid(s, 7, 0, R_EBX) &
605                 CPUID_7_0_EBX_RDSEED) {
606                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDSEED_EXITING << 32;
607             }
608             if (kvm_arch_get_supported_cpuid(s, 0x80000001, 0, R_EDX) &
609                 CPUID_EXT2_RDTSCP) {
610                 value |= (uint64_t)VMX_SECONDARY_EXEC_RDTSCP << 32;
611             }
612         }
613         /* fall through */
614     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
615     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
616     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
617     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
618         /*
619          * Return true for bits that can be one, but do not have to be one.
620          * The SDM tells us which bits could have a "must be one" setting,
621          * so we can do the opposite transformation in make_vmx_msr_value.
622          */
623         must_be_one = (uint32_t)value;
624         can_be_one = (uint32_t)(value >> 32);
625         return can_be_one & ~must_be_one;
626 
627     default:
628         return value;
629     }
630 }
631 
632 static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
633                                      int *max_banks)
634 {
635     *max_banks = kvm_check_extension(s, KVM_CAP_MCE);
636     return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
637 }
638 
639 static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
640 {
641     CPUState *cs = CPU(cpu);
642     CPUX86State *env = &cpu->env;
643     uint64_t status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_MISCV |
644                       MCI_STATUS_ADDRV;
645     uint64_t mcg_status = MCG_STATUS_MCIP | MCG_STATUS_RIPV;
646     int flags = 0;
647 
648     if (!IS_AMD_CPU(env)) {
649         status |= MCI_STATUS_S | MCI_STATUS_UC;
650         if (code == BUS_MCEERR_AR) {
651             status |= MCI_STATUS_AR | 0x134;
652             mcg_status |= MCG_STATUS_EIPV;
653         } else {
654             status |= 0xc0;
655         }
656     } else {
657         if (code == BUS_MCEERR_AR) {
658             status |= MCI_STATUS_UC | MCI_STATUS_POISON;
659             mcg_status |= MCG_STATUS_EIPV;
660         } else {
661             /* Setting the POISON bit for deferred errors indicates to the
662              * guest kernel that the address provided by the MCE is valid
663              * and usable which will ensure that the guest kernel will send
664              * a SIGBUS_AO signal to the guest process. This allows for
665              * more desirable behavior in the case that the guest process
666              * with poisoned memory has set the MCE_KILL_EARLY prctl flag
667              * which indicates that the process would prefer to handle or
668              * shutdown due to the poisoned memory condition before the
669              * memory has been accessed.
670              *
671              * While the POISON bit would not be set in a deferred error
672              * sent from hardware, the bit is not meaningful for deferred
673              * errors and can be reused in this scenario.
674              */
675             status |= MCI_STATUS_DEFERRED | MCI_STATUS_POISON;
676         }
677     }
678 
679     flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;
680     /* We need to read back the value of MSR_EXT_MCG_CTL that was set by the
681      * guest kernel back into env->mcg_ext_ctl.
682      */
683     cpu_synchronize_state(cs);
684     if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) {
685         mcg_status |= MCG_STATUS_LMCE;
686         flags = 0;
687     }
688 
689     cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
690                        (MCM_ADDR_PHYS << 6) | 0xc, flags);
691 }
692 
693 static void emit_hypervisor_memory_failure(MemoryFailureAction action, bool ar)
694 {
695     MemoryFailureFlags mff = {.action_required = ar, .recursive = false};
696 
697     qapi_event_send_memory_failure(MEMORY_FAILURE_RECIPIENT_HYPERVISOR, action,
698                                    &mff);
699 }
700 
/*
 * Fatal path for a memory error that hit QEMU itself: emit a FATAL,
 * action-required memory-failure event, log @host_addr and terminate
 * the process.  Does not return.
 */
static void hardware_memory_error(void *host_addr)
{
    emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_FATAL, true);
    error_report("QEMU got Hardware memory error at addr %p", host_addr);
    exit(1);
}
707 
708 void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
709 {
710     X86CPU *cpu = X86_CPU(c);
711     CPUX86State *env = &cpu->env;
712     ram_addr_t ram_addr;
713     hwaddr paddr;
714 
715     /* If we get an action required MCE, it has been injected by KVM
716      * while the VM was running.  An action optional MCE instead should
717      * be coming from the main thread, which qemu_init_sigbus identifies
718      * as the "early kill" thread.
719      */
720     assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
721 
722     if ((env->mcg_cap & MCG_SER_P) && addr) {
723         ram_addr = qemu_ram_addr_from_host(addr);
724         if (ram_addr != RAM_ADDR_INVALID &&
725             kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
726             kvm_hwpoison_page_add(ram_addr);
727             kvm_mce_inject(cpu, paddr, code);
728 
729             /*
730              * Use different logging severity based on error type.
731              * If there is additional MCE reporting on the hypervisor, QEMU VA
732              * could be another source to identify the PA and MCE details.
733              */
734             if (code == BUS_MCEERR_AR) {
735                 error_report("Guest MCE Memory Error at QEMU addr %p and "
736                     "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
737                     addr, paddr, "BUS_MCEERR_AR");
738             } else {
739                  warn_report("Guest MCE Memory Error at QEMU addr %p and "
740                      "GUEST addr 0x%" HWADDR_PRIx " of type %s injected",
741                      addr, paddr, "BUS_MCEERR_AO");
742             }
743 
744             return;
745         }
746 
747         if (code == BUS_MCEERR_AO) {
748             warn_report("Hardware memory error at addr %p of type %s "
749                 "for memory used by QEMU itself instead of guest system!",
750                  addr, "BUS_MCEERR_AO");
751         }
752     }
753 
754     if (code == BUS_MCEERR_AR) {
755         hardware_memory_error(addr);
756     }
757 
758     /* Hope we are lucky for AO MCE, just notify a event */
759     emit_hypervisor_memory_failure(MEMORY_FAILURE_ACTION_IGNORE, false);
760 }
761 
762 static void kvm_queue_exception(CPUX86State *env,
763                                 int32_t exception_nr,
764                                 uint8_t exception_has_payload,
765                                 uint64_t exception_payload)
766 {
767     assert(env->exception_nr == -1);
768     assert(!env->exception_pending);
769     assert(!env->exception_injected);
770     assert(!env->exception_has_payload);
771 
772     env->exception_nr = exception_nr;
773 
774     if (has_exception_payload) {
775         env->exception_pending = 1;
776 
777         env->exception_has_payload = exception_has_payload;
778         env->exception_payload = exception_payload;
779     } else {
780         env->exception_injected = 1;
781 
782         if (exception_nr == EXCP01_DB) {
783             assert(exception_has_payload);
784             env->dr[6] = exception_payload;
785         } else if (exception_nr == EXCP0E_PAGE) {
786             assert(exception_has_payload);
787             env->cr[2] = exception_payload;
788         } else {
789             assert(!exception_has_payload);
790         }
791     }
792 }
793 
794 static void cpu_update_state(void *opaque, bool running, RunState state)
795 {
796     CPUX86State *env = opaque;
797 
798     if (running) {
799         env->tsc_valid = false;
800     }
801 }
802 
803 unsigned long kvm_arch_vcpu_id(CPUState *cs)
804 {
805     X86CPU *cpu = X86_CPU(cs);
806     return cpu->apic_id;
807 }
808 
/* Fallback definition for headers that do not provide this constant. */
#ifndef KVM_CPUID_SIGNATURE_NEXT
#define KVM_CPUID_SIGNATURE_NEXT                0x40000100
#endif
812 
813 static bool hyperv_enabled(X86CPU *cpu)
814 {
815     return kvm_check_extension(kvm_state, KVM_CAP_HYPERV) > 0 &&
816         ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_NOTIFY) ||
817          cpu->hyperv_features || cpu->hyperv_passthrough);
818 }
819 
/*
 * Check whether target_freq is within conservative
 * ntp correctable bounds (250ppm) of freq
 *
 * @freq:        reference frequency
 * @target_freq: frequency to test
 *
 * Returns true when @target_freq lies in
 * [freq - freq * 250 / 1000000, freq + freq * 250 / 1000000], both ends
 * included.
 */
static inline bool freq_within_bounds(int freq, int target_freq)
{
    /*
     * Do the ppm arithmetic in 64 bits: freq * 250 does not fit in a
     * 32-bit int once freq exceeds INT_MAX / 250 (about 8.58 million).
     */
    int64_t tolerance = (int64_t)freq * 250 / 1000000;
    int64_t max_freq = freq + tolerance;
    int64_t min_freq = freq - tolerance;

    return target_freq >= min_freq && target_freq <= max_freq;
}
835 
836 static int kvm_arch_set_tsc_khz(CPUState *cs)
837 {
838     X86CPU *cpu = X86_CPU(cs);
839     CPUX86State *env = &cpu->env;
840     int r, cur_freq;
841     bool set_ioctl = false;
842 
843     if (!env->tsc_khz) {
844         return 0;
845     }
846 
847     cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
848                kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) : -ENOTSUP;
849 
850     /*
851      * If TSC scaling is supported, attempt to set TSC frequency.
852      */
853     if (kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL)) {
854         set_ioctl = true;
855     }
856 
857     /*
858      * If desired TSC frequency is within bounds of NTP correction,
859      * attempt to set TSC frequency.
860      */
861     if (cur_freq != -ENOTSUP && freq_within_bounds(cur_freq, env->tsc_khz)) {
862         set_ioctl = true;
863     }
864 
865     r = set_ioctl ?
866         kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
867         -ENOTSUP;
868 
869     if (r < 0) {
870         /* When KVM_SET_TSC_KHZ fails, it's an error only if the current
871          * TSC frequency doesn't match the one we want.
872          */
873         cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
874                    kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
875                    -ENOTSUP;
876         if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
877             warn_report("TSC frequency mismatch between "
878                         "VM (%" PRId64 " kHz) and host (%d kHz), "
879                         "and TSC scaling unavailable",
880                         env->tsc_khz, cur_freq);
881             return r;
882         }
883     }
884 
885     return 0;
886 }
887 
888 static bool tsc_is_stable_and_known(CPUX86State *env)
889 {
890     if (!env->tsc_khz) {
891         return false;
892     }
893     return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC)
894         || env->user_tsc_khz;
895 }
896 
/*
 * Evaluates to 0x0101.  NOTE(review): presumably two 8-bit version
 * fields (bits 15:8 and 7:0), both set to 1 - confirm against the user.
 */
#define DEFAULT_EVMCS_VERSION ((1 << 8) | 1)
898 
899 static struct {
900     const char *desc;
901     struct {
902         uint32_t func;
903         int reg;
904         uint32_t bits;
905     } flags[2];
906     uint64_t dependencies;
907 } kvm_hyperv_properties[] = {
908     [HYPERV_FEAT_RELAXED] = {
909         .desc = "relaxed timing (hv-relaxed)",
910         .flags = {
911             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
912              .bits = HV_RELAXED_TIMING_RECOMMENDED}
913         }
914     },
915     [HYPERV_FEAT_VAPIC] = {
916         .desc = "virtual APIC (hv-vapic)",
917         .flags = {
918             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
919              .bits = HV_APIC_ACCESS_AVAILABLE}
920         }
921     },
922     [HYPERV_FEAT_TIME] = {
923         .desc = "clocksources (hv-time)",
924         .flags = {
925             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
926              .bits = HV_TIME_REF_COUNT_AVAILABLE | HV_REFERENCE_TSC_AVAILABLE}
927         }
928     },
929     [HYPERV_FEAT_CRASH] = {
930         .desc = "crash MSRs (hv-crash)",
931         .flags = {
932             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
933              .bits = HV_GUEST_CRASH_MSR_AVAILABLE}
934         }
935     },
936     [HYPERV_FEAT_RESET] = {
937         .desc = "reset MSR (hv-reset)",
938         .flags = {
939             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
940              .bits = HV_RESET_AVAILABLE}
941         }
942     },
943     [HYPERV_FEAT_VPINDEX] = {
944         .desc = "VP_INDEX MSR (hv-vpindex)",
945         .flags = {
946             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
947              .bits = HV_VP_INDEX_AVAILABLE}
948         }
949     },
950     [HYPERV_FEAT_RUNTIME] = {
951         .desc = "VP_RUNTIME MSR (hv-runtime)",
952         .flags = {
953             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
954              .bits = HV_VP_RUNTIME_AVAILABLE}
955         }
956     },
957     [HYPERV_FEAT_SYNIC] = {
958         .desc = "synthetic interrupt controller (hv-synic)",
959         .flags = {
960             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
961              .bits = HV_SYNIC_AVAILABLE}
962         }
963     },
964     [HYPERV_FEAT_STIMER] = {
965         .desc = "synthetic timers (hv-stimer)",
966         .flags = {
967             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
968              .bits = HV_SYNTIMERS_AVAILABLE}
969         },
970         .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME)
971     },
972     [HYPERV_FEAT_FREQUENCIES] = {
973         .desc = "frequency MSRs (hv-frequencies)",
974         .flags = {
975             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
976              .bits = HV_ACCESS_FREQUENCY_MSRS},
977             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
978              .bits = HV_FREQUENCY_MSRS_AVAILABLE}
979         }
980     },
981     [HYPERV_FEAT_REENLIGHTENMENT] = {
982         .desc = "reenlightenment MSRs (hv-reenlightenment)",
983         .flags = {
984             {.func = HV_CPUID_FEATURES, .reg = R_EAX,
985              .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL}
986         }
987     },
988     [HYPERV_FEAT_TLBFLUSH] = {
989         .desc = "paravirtualized TLB flush (hv-tlbflush)",
990         .flags = {
991             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
992              .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED |
993              HV_EX_PROCESSOR_MASKS_RECOMMENDED}
994         },
995         .dependencies = BIT(HYPERV_FEAT_VPINDEX)
996     },
997     [HYPERV_FEAT_EVMCS] = {
998         .desc = "enlightened VMCS (hv-evmcs)",
999         .flags = {
1000             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
1001              .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED}
1002         },
1003         .dependencies = BIT(HYPERV_FEAT_VAPIC)
1004     },
1005     [HYPERV_FEAT_IPI] = {
1006         .desc = "paravirtualized IPI (hv-ipi)",
1007         .flags = {
1008             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
1009              .bits = HV_CLUSTER_IPI_RECOMMENDED |
1010              HV_EX_PROCESSOR_MASKS_RECOMMENDED}
1011         },
1012         .dependencies = BIT(HYPERV_FEAT_VPINDEX)
1013     },
1014     [HYPERV_FEAT_STIMER_DIRECT] = {
1015         .desc = "direct mode synthetic timers (hv-stimer-direct)",
1016         .flags = {
1017             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
1018              .bits = HV_STIMER_DIRECT_MODE_AVAILABLE}
1019         },
1020         .dependencies = BIT(HYPERV_FEAT_STIMER)
1021     },
1022     [HYPERV_FEAT_AVIC] = {
1023         .desc = "AVIC/APICv support (hv-avic/hv-apicv)",
1024         .flags = {
1025             {.func = HV_CPUID_ENLIGHTMENT_INFO, .reg = R_EAX,
1026              .bits = HV_DEPRECATING_AEOI_RECOMMENDED}
1027         }
1028     },
1029 #ifdef CONFIG_SYNDBG
1030     [HYPERV_FEAT_SYNDBG] = {
1031         .desc = "Enable synthetic kernel debugger channel (hv-syndbg)",
1032         .flags = {
1033             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
1034              .bits = HV_FEATURE_DEBUG_MSRS_AVAILABLE}
1035         },
1036         .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_RELAXED)
1037     },
1038 #endif
1039     [HYPERV_FEAT_MSR_BITMAP] = {
1040         .desc = "enlightened MSR-Bitmap (hv-emsr-bitmap)",
1041         .flags = {
1042             {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX,
1043              .bits = HV_NESTED_MSR_BITMAP}
1044         }
1045     },
1046     [HYPERV_FEAT_XMM_INPUT] = {
1047         .desc = "XMM fast hypercall input (hv-xmm-input)",
1048         .flags = {
1049             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
1050              .bits = HV_HYPERCALL_XMM_INPUT_AVAILABLE}
1051         }
1052     },
1053     [HYPERV_FEAT_TLBFLUSH_EXT] = {
1054         .desc = "Extended gva ranges for TLB flush hypercalls (hv-tlbflush-ext)",
1055         .flags = {
1056             {.func = HV_CPUID_FEATURES, .reg = R_EDX,
1057              .bits = HV_EXT_GVA_RANGES_FLUSH_AVAILABLE}
1058         },
1059         .dependencies = BIT(HYPERV_FEAT_TLBFLUSH)
1060     },
1061     [HYPERV_FEAT_TLBFLUSH_DIRECT] = {
1062         .desc = "direct TLB flush (hv-tlbflush-direct)",
1063         .flags = {
1064             {.func = HV_CPUID_NESTED_FEATURES, .reg = R_EAX,
1065              .bits = HV_NESTED_DIRECT_FLUSH}
1066         },
1067         .dependencies = BIT(HYPERV_FEAT_VAPIC)
1068     },
1069 };
1070 
1071 static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max,
1072                                            bool do_sys_ioctl)
1073 {
1074     struct kvm_cpuid2 *cpuid;
1075     int r, size;
1076 
1077     size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
1078     cpuid = g_malloc0(size);
1079     cpuid->nent = max;
1080 
1081     if (do_sys_ioctl) {
1082         r = kvm_ioctl(kvm_state, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
1083     } else {
1084         r = kvm_vcpu_ioctl(cs, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
1085     }
1086     if (r == 0 && cpuid->nent >= max) {
1087         r = -E2BIG;
1088     }
1089     if (r < 0) {
1090         if (r == -E2BIG) {
1091             g_free(cpuid);
1092             return NULL;
1093         } else {
1094             fprintf(stderr, "KVM_GET_SUPPORTED_HV_CPUID failed: %s\n",
1095                     strerror(-r));
1096             exit(1);
1097         }
1098     }
1099     return cpuid;
1100 }
1101 
1102 /*
1103  * Run KVM_GET_SUPPORTED_HV_CPUID ioctl(), allocating a buffer large enough
1104  * for all entries.
1105  */
1106 static struct kvm_cpuid2 *get_supported_hv_cpuid(CPUState *cs)
1107 {
1108     struct kvm_cpuid2 *cpuid;
1109     /* 0x40000000..0x40000005, 0x4000000A, 0x40000080..0x40000082 leaves */
1110     int max = 11;
1111     int i;
1112     bool do_sys_ioctl;
1113 
1114     do_sys_ioctl =
1115         kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID) > 0;
1116 
1117     /*
1118      * Non-empty KVM context is needed when KVM_CAP_SYS_HYPERV_CPUID is
1119      * unsupported, kvm_hyperv_expand_features() checks for that.
1120      */
1121     assert(do_sys_ioctl || cs->kvm_state);
1122 
1123     /*
1124      * When the buffer is too small, KVM_GET_SUPPORTED_HV_CPUID fails with
1125      * -E2BIG, however, it doesn't report back the right size. Keep increasing
1126      * it and re-trying until we succeed.
1127      */
1128     while ((cpuid = try_get_hv_cpuid(cs, max, do_sys_ioctl)) == NULL) {
1129         max++;
1130     }
1131 
1132     /*
1133      * KVM_GET_SUPPORTED_HV_CPUID does not set EVMCS CPUID bit before
1134      * KVM_CAP_HYPERV_ENLIGHTENED_VMCS is enabled but we want to get the
1135      * information early, just check for the capability and set the bit
1136      * manually.
1137      */
1138     if (!do_sys_ioctl && kvm_check_extension(cs->kvm_state,
1139                             KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
1140         for (i = 0; i < cpuid->nent; i++) {
1141             if (cpuid->entries[i].function == HV_CPUID_ENLIGHTMENT_INFO) {
1142                 cpuid->entries[i].eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
1143             }
1144         }
1145     }
1146 
1147     return cpuid;
1148 }
1149 
1150 /*
1151  * When KVM_GET_SUPPORTED_HV_CPUID is not supported we fill CPUID feature
1152  * leaves from KVM_CAP_HYPERV* and present MSRs data.
1153  */
1154 static struct kvm_cpuid2 *get_supported_hv_cpuid_legacy(CPUState *cs)
1155 {
1156     X86CPU *cpu = X86_CPU(cs);
1157     struct kvm_cpuid2 *cpuid;
1158     struct kvm_cpuid_entry2 *entry_feat, *entry_recomm;
1159 
1160     /* HV_CPUID_FEATURES, HV_CPUID_ENLIGHTMENT_INFO */
1161     cpuid = g_malloc0(sizeof(*cpuid) + 2 * sizeof(*cpuid->entries));
1162     cpuid->nent = 2;
1163 
1164     /* HV_CPUID_VENDOR_AND_MAX_FUNCTIONS */
1165     entry_feat = &cpuid->entries[0];
1166     entry_feat->function = HV_CPUID_FEATURES;
1167 
1168     entry_recomm = &cpuid->entries[1];
1169     entry_recomm->function = HV_CPUID_ENLIGHTMENT_INFO;
1170     entry_recomm->ebx = cpu->hyperv_spinlock_attempts;
1171 
1172     if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0) {
1173         entry_feat->eax |= HV_HYPERCALL_AVAILABLE;
1174         entry_feat->eax |= HV_APIC_ACCESS_AVAILABLE;
1175         entry_feat->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1176         entry_recomm->eax |= HV_RELAXED_TIMING_RECOMMENDED;
1177         entry_recomm->eax |= HV_APIC_ACCESS_RECOMMENDED;
1178     }
1179 
1180     if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
1181         entry_feat->eax |= HV_TIME_REF_COUNT_AVAILABLE;
1182         entry_feat->eax |= HV_REFERENCE_TSC_AVAILABLE;
1183     }
1184 
1185     if (has_msr_hv_frequencies) {
1186         entry_feat->eax |= HV_ACCESS_FREQUENCY_MSRS;
1187         entry_feat->edx |= HV_FREQUENCY_MSRS_AVAILABLE;
1188     }
1189 
1190     if (has_msr_hv_crash) {
1191         entry_feat->edx |= HV_GUEST_CRASH_MSR_AVAILABLE;
1192     }
1193 
1194     if (has_msr_hv_reenlightenment) {
1195         entry_feat->eax |= HV_ACCESS_REENLIGHTENMENTS_CONTROL;
1196     }
1197 
1198     if (has_msr_hv_reset) {
1199         entry_feat->eax |= HV_RESET_AVAILABLE;
1200     }
1201 
1202     if (has_msr_hv_vpindex) {
1203         entry_feat->eax |= HV_VP_INDEX_AVAILABLE;
1204     }
1205 
1206     if (has_msr_hv_runtime) {
1207         entry_feat->eax |= HV_VP_RUNTIME_AVAILABLE;
1208     }
1209 
1210     if (has_msr_hv_synic) {
1211         unsigned int cap = cpu->hyperv_synic_kvm_only ?
1212             KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1213 
1214         if (kvm_check_extension(cs->kvm_state, cap) > 0) {
1215             entry_feat->eax |= HV_SYNIC_AVAILABLE;
1216         }
1217     }
1218 
1219     if (has_msr_hv_stimer) {
1220         entry_feat->eax |= HV_SYNTIMERS_AVAILABLE;
1221     }
1222 
1223     if (has_msr_hv_syndbg_options) {
1224         entry_feat->edx |= HV_GUEST_DEBUGGING_AVAILABLE;
1225         entry_feat->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE;
1226         entry_feat->ebx |= HV_PARTITION_DEBUGGING_ALLOWED;
1227     }
1228 
1229     if (kvm_check_extension(cs->kvm_state,
1230                             KVM_CAP_HYPERV_TLBFLUSH) > 0) {
1231         entry_recomm->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED;
1232         entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1233     }
1234 
1235     if (kvm_check_extension(cs->kvm_state,
1236                             KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
1237         entry_recomm->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
1238     }
1239 
1240     if (kvm_check_extension(cs->kvm_state,
1241                             KVM_CAP_HYPERV_SEND_IPI) > 0) {
1242         entry_recomm->eax |= HV_CLUSTER_IPI_RECOMMENDED;
1243         entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
1244     }
1245 
1246     return cpuid;
1247 }
1248 
1249 static uint32_t hv_cpuid_get_host(CPUState *cs, uint32_t func, int reg)
1250 {
1251     struct kvm_cpuid_entry2 *entry;
1252     struct kvm_cpuid2 *cpuid;
1253 
1254     if (hv_cpuid_cache) {
1255         cpuid = hv_cpuid_cache;
1256     } else {
1257         if (kvm_check_extension(kvm_state, KVM_CAP_HYPERV_CPUID) > 0) {
1258             cpuid = get_supported_hv_cpuid(cs);
1259         } else {
1260             /*
1261              * 'cs->kvm_state' may be NULL when Hyper-V features are expanded
1262              * before KVM context is created but this is only done when
1263              * KVM_CAP_SYS_HYPERV_CPUID is supported and it implies
1264              * KVM_CAP_HYPERV_CPUID.
1265              */
1266             assert(cs->kvm_state);
1267 
1268             cpuid = get_supported_hv_cpuid_legacy(cs);
1269         }
1270         hv_cpuid_cache = cpuid;
1271     }
1272 
1273     if (!cpuid) {
1274         return 0;
1275     }
1276 
1277     entry = cpuid_find_entry(cpuid, func, 0);
1278     if (!entry) {
1279         return 0;
1280     }
1281 
1282     return cpuid_entry_get_reg(entry, reg);
1283 }
1284 
1285 static bool hyperv_feature_supported(CPUState *cs, int feature)
1286 {
1287     uint32_t func, bits;
1288     int i, reg;
1289 
1290     for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {
1291 
1292         func = kvm_hyperv_properties[feature].flags[i].func;
1293         reg = kvm_hyperv_properties[feature].flags[i].reg;
1294         bits = kvm_hyperv_properties[feature].flags[i].bits;
1295 
1296         if (!func) {
1297             continue;
1298         }
1299 
1300         if ((hv_cpuid_get_host(cs, func, reg) & bits) != bits) {
1301             return false;
1302         }
1303     }
1304 
1305     return true;
1306 }
1307 
1308 /* Checks that all feature dependencies are enabled */
1309 static bool hv_feature_check_deps(X86CPU *cpu, int feature, Error **errp)
1310 {
1311     uint64_t deps;
1312     int dep_feat;
1313 
1314     deps = kvm_hyperv_properties[feature].dependencies;
1315     while (deps) {
1316         dep_feat = ctz64(deps);
1317         if (!(hyperv_feat_enabled(cpu, dep_feat))) {
1318             error_setg(errp, "Hyper-V %s requires Hyper-V %s",
1319                        kvm_hyperv_properties[feature].desc,
1320                        kvm_hyperv_properties[dep_feat].desc);
1321             return false;
1322         }
1323         deps &= ~(1ull << dep_feat);
1324     }
1325 
1326     return true;
1327 }
1328 
1329 static uint32_t hv_build_cpuid_leaf(CPUState *cs, uint32_t func, int reg)
1330 {
1331     X86CPU *cpu = X86_CPU(cs);
1332     uint32_t r = 0;
1333     int i, j;
1334 
1335     for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties); i++) {
1336         if (!hyperv_feat_enabled(cpu, i)) {
1337             continue;
1338         }
1339 
1340         for (j = 0; j < ARRAY_SIZE(kvm_hyperv_properties[i].flags); j++) {
1341             if (kvm_hyperv_properties[i].flags[j].func != func) {
1342                 continue;
1343             }
1344             if (kvm_hyperv_properties[i].flags[j].reg != reg) {
1345                 continue;
1346             }
1347 
1348             r |= kvm_hyperv_properties[i].flags[j].bits;
1349         }
1350     }
1351 
1352     /* HV_CPUID_NESTED_FEATURES.EAX also encodes the supported eVMCS range */
1353     if (func == HV_CPUID_NESTED_FEATURES && reg == R_EAX) {
1354         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
1355             r |= DEFAULT_EVMCS_VERSION;
1356         }
1357     }
1358 
1359     return r;
1360 }
1361 
1362 /*
1363  * Expand Hyper-V CPU features. In partucular, check that all the requested
1364  * features are supported by the host and the sanity of the configuration
1365  * (that all the required dependencies are included). Also, this takes care
1366  * of 'hv_passthrough' mode and fills the environment with all supported
1367  * Hyper-V features.
1368  */
1369 bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp)
1370 {
1371     CPUState *cs = CPU(cpu);
1372     Error *local_err = NULL;
1373     int feat;
1374 
1375     if (!hyperv_enabled(cpu))
1376         return true;
1377 
1378     /*
1379      * When kvm_hyperv_expand_features is called at CPU feature expansion
1380      * time per-CPU kvm_state is not available yet so we can only proceed
1381      * when KVM_CAP_SYS_HYPERV_CPUID is supported.
1382      */
1383     if (!cs->kvm_state &&
1384         !kvm_check_extension(kvm_state, KVM_CAP_SYS_HYPERV_CPUID))
1385         return true;
1386 
1387     if (cpu->hyperv_passthrough) {
1388         cpu->hyperv_vendor_id[0] =
1389             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EBX);
1390         cpu->hyperv_vendor_id[1] =
1391             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_ECX);
1392         cpu->hyperv_vendor_id[2] =
1393             hv_cpuid_get_host(cs, HV_CPUID_VENDOR_AND_MAX_FUNCTIONS, R_EDX);
1394         cpu->hyperv_vendor = g_realloc(cpu->hyperv_vendor,
1395                                        sizeof(cpu->hyperv_vendor_id) + 1);
1396         memcpy(cpu->hyperv_vendor, cpu->hyperv_vendor_id,
1397                sizeof(cpu->hyperv_vendor_id));
1398         cpu->hyperv_vendor[sizeof(cpu->hyperv_vendor_id)] = 0;
1399 
1400         cpu->hyperv_interface_id[0] =
1401             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EAX);
1402         cpu->hyperv_interface_id[1] =
1403             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EBX);
1404         cpu->hyperv_interface_id[2] =
1405             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_ECX);
1406         cpu->hyperv_interface_id[3] =
1407             hv_cpuid_get_host(cs, HV_CPUID_INTERFACE, R_EDX);
1408 
1409         cpu->hyperv_ver_id_build =
1410             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EAX);
1411         cpu->hyperv_ver_id_major =
1412             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) >> 16;
1413         cpu->hyperv_ver_id_minor =
1414             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EBX) & 0xffff;
1415         cpu->hyperv_ver_id_sp =
1416             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_ECX);
1417         cpu->hyperv_ver_id_sb =
1418             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) >> 24;
1419         cpu->hyperv_ver_id_sn =
1420             hv_cpuid_get_host(cs, HV_CPUID_VERSION, R_EDX) & 0xffffff;
1421 
1422         cpu->hv_max_vps = hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS,
1423                                             R_EAX);
1424         cpu->hyperv_limits[0] =
1425             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EBX);
1426         cpu->hyperv_limits[1] =
1427             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_ECX);
1428         cpu->hyperv_limits[2] =
1429             hv_cpuid_get_host(cs, HV_CPUID_IMPLEMENT_LIMITS, R_EDX);
1430 
1431         cpu->hyperv_spinlock_attempts =
1432             hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EBX);
1433 
1434         /*
1435          * Mark feature as enabled in 'cpu->hyperv_features' as
1436          * hv_build_cpuid_leaf() uses this info to build guest CPUIDs.
1437          */
1438         for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
1439             if (hyperv_feature_supported(cs, feat)) {
1440                 cpu->hyperv_features |= BIT(feat);
1441             }
1442         }
1443     } else {
1444         /* Check features availability and dependencies */
1445         for (feat = 0; feat < ARRAY_SIZE(kvm_hyperv_properties); feat++) {
1446             /* If the feature was not requested skip it. */
1447             if (!hyperv_feat_enabled(cpu, feat)) {
1448                 continue;
1449             }
1450 
1451             /* Check if the feature is supported by KVM */
1452             if (!hyperv_feature_supported(cs, feat)) {
1453                 error_setg(errp, "Hyper-V %s is not supported by kernel",
1454                            kvm_hyperv_properties[feat].desc);
1455                 return false;
1456             }
1457 
1458             /* Check dependencies */
1459             if (!hv_feature_check_deps(cpu, feat, &local_err)) {
1460                 error_propagate(errp, local_err);
1461                 return false;
1462             }
1463         }
1464     }
1465 
1466     /* Additional dependencies not covered by kvm_hyperv_properties[] */
1467     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1468         !cpu->hyperv_synic_kvm_only &&
1469         !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) {
1470         error_setg(errp, "Hyper-V %s requires Hyper-V %s",
1471                    kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc,
1472                    kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc);
1473         return false;
1474     }
1475 
1476     return true;
1477 }
1478 
1479 /*
1480  * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent.
1481  */
1482 static int hyperv_fill_cpuids(CPUState *cs,
1483                               struct kvm_cpuid_entry2 *cpuid_ent)
1484 {
1485     X86CPU *cpu = X86_CPU(cs);
1486     struct kvm_cpuid_entry2 *c;
1487     uint32_t signature[3];
1488     uint32_t cpuid_i = 0, max_cpuid_leaf = 0;
1489     uint32_t nested_eax =
1490         hv_build_cpuid_leaf(cs, HV_CPUID_NESTED_FEATURES, R_EAX);
1491 
1492     max_cpuid_leaf = nested_eax ? HV_CPUID_NESTED_FEATURES :
1493         HV_CPUID_IMPLEMENT_LIMITS;
1494 
1495     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) {
1496         max_cpuid_leaf =
1497             MAX(max_cpuid_leaf, HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
1498     }
1499 
1500     c = &cpuid_ent[cpuid_i++];
1501     c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
1502     c->eax = max_cpuid_leaf;
1503     c->ebx = cpu->hyperv_vendor_id[0];
1504     c->ecx = cpu->hyperv_vendor_id[1];
1505     c->edx = cpu->hyperv_vendor_id[2];
1506 
1507     c = &cpuid_ent[cpuid_i++];
1508     c->function = HV_CPUID_INTERFACE;
1509     c->eax = cpu->hyperv_interface_id[0];
1510     c->ebx = cpu->hyperv_interface_id[1];
1511     c->ecx = cpu->hyperv_interface_id[2];
1512     c->edx = cpu->hyperv_interface_id[3];
1513 
1514     c = &cpuid_ent[cpuid_i++];
1515     c->function = HV_CPUID_VERSION;
1516     c->eax = cpu->hyperv_ver_id_build;
1517     c->ebx = (uint32_t)cpu->hyperv_ver_id_major << 16 |
1518         cpu->hyperv_ver_id_minor;
1519     c->ecx = cpu->hyperv_ver_id_sp;
1520     c->edx = (uint32_t)cpu->hyperv_ver_id_sb << 24 |
1521         (cpu->hyperv_ver_id_sn & 0xffffff);
1522 
1523     c = &cpuid_ent[cpuid_i++];
1524     c->function = HV_CPUID_FEATURES;
1525     c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EAX);
1526     c->ebx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EBX);
1527     c->edx = hv_build_cpuid_leaf(cs, HV_CPUID_FEATURES, R_EDX);
1528 
1529     /* Unconditionally required with any Hyper-V enlightenment */
1530     c->eax |= HV_HYPERCALL_AVAILABLE;
1531 
1532     /* SynIC and Vmbus devices require messages/signals hypercalls */
1533     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1534         !cpu->hyperv_synic_kvm_only) {
1535         c->ebx |= HV_POST_MESSAGES | HV_SIGNAL_EVENTS;
1536     }
1537 
1538 
1539     /* Not exposed by KVM but needed to make CPU hotplug in Windows work */
1540     c->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1541 
1542     c = &cpuid_ent[cpuid_i++];
1543     c->function = HV_CPUID_ENLIGHTMENT_INFO;
1544     c->eax = hv_build_cpuid_leaf(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX);
1545     c->ebx = cpu->hyperv_spinlock_attempts;
1546 
1547     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) &&
1548         !hyperv_feat_enabled(cpu, HYPERV_FEAT_AVIC)) {
1549         c->eax |= HV_APIC_ACCESS_RECOMMENDED;
1550     }
1551 
1552     if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_ON) {
1553         c->eax |= HV_NO_NONARCH_CORESHARING;
1554     } else if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO) {
1555         c->eax |= hv_cpuid_get_host(cs, HV_CPUID_ENLIGHTMENT_INFO, R_EAX) &
1556             HV_NO_NONARCH_CORESHARING;
1557     }
1558 
1559     c = &cpuid_ent[cpuid_i++];
1560     c->function = HV_CPUID_IMPLEMENT_LIMITS;
1561     c->eax = cpu->hv_max_vps;
1562     c->ebx = cpu->hyperv_limits[0];
1563     c->ecx = cpu->hyperv_limits[1];
1564     c->edx = cpu->hyperv_limits[2];
1565 
1566     if (nested_eax) {
1567         uint32_t function;
1568 
1569         /* Create zeroed 0x40000006..0x40000009 leaves */
1570         for (function = HV_CPUID_IMPLEMENT_LIMITS + 1;
1571              function < HV_CPUID_NESTED_FEATURES; function++) {
1572             c = &cpuid_ent[cpuid_i++];
1573             c->function = function;
1574         }
1575 
1576         c = &cpuid_ent[cpuid_i++];
1577         c->function = HV_CPUID_NESTED_FEATURES;
1578         c->eax = nested_eax;
1579     }
1580 
1581     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG)) {
1582         c = &cpuid_ent[cpuid_i++];
1583         c->function = HV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS;
1584         c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ?
1585             HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
1586         memcpy(signature, "Microsoft VS", 12);
1587         c->eax = 0;
1588         c->ebx = signature[0];
1589         c->ecx = signature[1];
1590         c->edx = signature[2];
1591 
1592         c = &cpuid_ent[cpuid_i++];
1593         c->function = HV_CPUID_SYNDBG_INTERFACE;
1594         memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12);
1595         c->eax = signature[0];
1596         c->ebx = 0;
1597         c->ecx = 0;
1598         c->edx = 0;
1599 
1600         c = &cpuid_ent[cpuid_i++];
1601         c->function = HV_CPUID_SYNDBG_PLATFORM_CAPABILITIES;
1602         c->eax = HV_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING;
1603         c->ebx = 0;
1604         c->ecx = 0;
1605         c->edx = 0;
1606     }
1607 
1608     return cpuid_i;
1609 }
1610 
/*
 * Migration blockers set up by hyperv_init_vcpu(). Each one is created and
 * registered only while its pointer is still NULL: the first for
 * 'hv-passthrough', the second for 'hv-no-nonarch-coresharing=auto'.
 */
static Error *hv_passthrough_mig_blocker;
static Error *hv_no_nonarch_cs_mig_blocker;
1613 
/*
 * Check that the exposed eVMCS version range is supported by KVM.
 *
 * Both arguments encode a range: the minimum version in the low byte and
 * the maximum version in the high byte. Returns true when the range in
 * @evmcs_version lies within the one in @supported_evmcs_version.
 */
static bool evmcs_version_supported(uint16_t evmcs_version,
                                    uint16_t supported_evmcs_version)
{
    const uint8_t want_min = (uint8_t)(evmcs_version & 0xff);
    const uint8_t want_max = (uint8_t)(evmcs_version >> 8);
    const uint8_t have_min = (uint8_t)(supported_evmcs_version & 0xff);
    const uint8_t have_max = (uint8_t)(supported_evmcs_version >> 8);

    if (want_min < have_min) {
        return false;
    }

    return want_max <= have_max;
}
1626 
1627 static int hyperv_init_vcpu(X86CPU *cpu)
1628 {
1629     CPUState *cs = CPU(cpu);
1630     Error *local_err = NULL;
1631     int ret;
1632 
1633     if (cpu->hyperv_passthrough && hv_passthrough_mig_blocker == NULL) {
1634         error_setg(&hv_passthrough_mig_blocker,
1635                    "'hv-passthrough' CPU flag prevents migration, use explicit"
1636                    " set of hv-* flags instead");
1637         ret = migrate_add_blocker(&hv_passthrough_mig_blocker, &local_err);
1638         if (ret < 0) {
1639             error_report_err(local_err);
1640             return ret;
1641         }
1642     }
1643 
1644     if (cpu->hyperv_no_nonarch_cs == ON_OFF_AUTO_AUTO &&
1645         hv_no_nonarch_cs_mig_blocker == NULL) {
1646         error_setg(&hv_no_nonarch_cs_mig_blocker,
1647                    "'hv-no-nonarch-coresharing=auto' CPU flag prevents migration"
1648                    " use explicit 'hv-no-nonarch-coresharing=on' instead (but"
1649                    " make sure SMT is disabled and/or that vCPUs are properly"
1650                    " pinned)");
1651         ret = migrate_add_blocker(&hv_no_nonarch_cs_mig_blocker, &local_err);
1652         if (ret < 0) {
1653             error_report_err(local_err);
1654             return ret;
1655         }
1656     }
1657 
1658     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && !hv_vpindex_settable) {
1659         /*
1660          * the kernel doesn't support setting vp_index; assert that its value
1661          * is in sync
1662          */
1663         uint64_t value;
1664 
1665         ret = kvm_get_one_msr(cpu, HV_X64_MSR_VP_INDEX, &value);
1666         if (ret < 0) {
1667             return ret;
1668         }
1669 
1670         if (value != hyperv_vp_index(CPU(cpu))) {
1671             error_report("kernel's vp_index != QEMU's vp_index");
1672             return -ENXIO;
1673         }
1674     }
1675 
1676     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
1677         uint32_t synic_cap = cpu->hyperv_synic_kvm_only ?
1678             KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1679         ret = kvm_vcpu_enable_cap(cs, synic_cap, 0);
1680         if (ret < 0) {
1681             error_report("failed to turn on HyperV SynIC in KVM: %s",
1682                          strerror(-ret));
1683             return ret;
1684         }
1685 
1686         if (!cpu->hyperv_synic_kvm_only) {
1687             ret = hyperv_x86_synic_add(cpu);
1688             if (ret < 0) {
1689                 error_report("failed to create HyperV SynIC: %s",
1690                              strerror(-ret));
1691                 return ret;
1692             }
1693         }
1694     }
1695 
1696     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
1697         uint16_t evmcs_version = DEFAULT_EVMCS_VERSION;
1698         uint16_t supported_evmcs_version;
1699 
1700         ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
1701                                   (uintptr_t)&supported_evmcs_version);
1702 
1703         /*
1704          * KVM is required to support EVMCS ver.1. as that's what 'hv-evmcs'
1705          * option sets. Note: we hardcode the maximum supported eVMCS version
1706          * to '1' as well so 'hv-evmcs' feature is migratable even when (and if)
1707          * ver.2 is implemented. A new option (e.g. 'hv-evmcs=2') will then have
1708          * to be added.
1709          */
1710         if (ret < 0) {
1711             error_report("Hyper-V %s is not supported by kernel",
1712                          kvm_hyperv_properties[HYPERV_FEAT_EVMCS].desc);
1713             return ret;
1714         }
1715 
1716         if (!evmcs_version_supported(evmcs_version, supported_evmcs_version)) {
1717             error_report("eVMCS version range [%d..%d] is not supported by "
1718                          "kernel (supported: [%d..%d])", evmcs_version & 0xff,
1719                          evmcs_version >> 8, supported_evmcs_version & 0xff,
1720                          supported_evmcs_version >> 8);
1721             return -ENOTSUP;
1722         }
1723     }
1724 
1725     if (cpu->hyperv_enforce_cpuid) {
1726         ret = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENFORCE_CPUID, 0, 1);
1727         if (ret < 0) {
1728             error_report("failed to enable KVM_CAP_HYPERV_ENFORCE_CPUID: %s",
1729                          strerror(-ret));
1730             return ret;
1731         }
1732     }
1733 
1734     /* Skip SynIC and VP_INDEX since they are hard deps already */
1735     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_STIMER) &&
1736         hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC) &&
1737         hyperv_feat_enabled(cpu, HYPERV_FEAT_RUNTIME)) {
1738         hyperv_x86_set_vmbus_recommended_features_enabled();
1739     }
1740 
1741     return 0;
1742 }
1743 
/*
 * NOTE(review): presumably the migration blocker used for guests with
 * invariant TSC; its users are outside this part of the file - confirm.
 */
static Error *invtsc_mig_blocker;

/*
 * Upper bound on the number of CPUID entries: kvm_x86_build_cpuid() stops
 * adding entries once this many have been filled.
 */
#define KVM_MAX_CPUID_ENTRIES  100
1747 
1748 static void kvm_init_xsave(CPUX86State *env)
1749 {
1750     if (has_xsave2) {
1751         env->xsave_buf_len = QEMU_ALIGN_UP(has_xsave2, 4096);
1752     } else {
1753         env->xsave_buf_len = sizeof(struct kvm_xsave);
1754     }
1755 
1756     env->xsave_buf = qemu_memalign(4096, env->xsave_buf_len);
1757     memset(env->xsave_buf, 0, env->xsave_buf_len);
1758     /*
1759      * The allocated storage must be large enough for all of the
1760      * possible XSAVE state components.
1761      */
1762     assert(kvm_arch_get_supported_cpuid(kvm_state, 0xd, 0, R_ECX) <=
1763            env->xsave_buf_len);
1764 }
1765 
1766 static void kvm_init_nested_state(CPUX86State *env)
1767 {
1768     struct kvm_vmx_nested_state_hdr *vmx_hdr;
1769     uint32_t size;
1770 
1771     if (!env->nested_state) {
1772         return;
1773     }
1774 
1775     size = env->nested_state->size;
1776 
1777     memset(env->nested_state, 0, size);
1778     env->nested_state->size = size;
1779 
1780     if (cpu_has_vmx(env)) {
1781         env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
1782         vmx_hdr = &env->nested_state->hdr.vmx;
1783         vmx_hdr->vmxon_pa = -1ull;
1784         vmx_hdr->vmcs12_pa = -1ull;
1785     } else if (cpu_has_svm(env)) {
1786         env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM;
1787     }
1788 }
1789 
/*
 * Append the CPUID leaves produced by cpu_x86_cpuid() to @entries.
 *
 * @env:     CPU state the leaves are queried from.
 * @entries: array with room for KVM_MAX_CPUID_ENTRIES entries.
 * @cpuid_i: index of the first free slot in @entries.
 *
 * Three ranges are walked in order: the basic leaves (0 .. limit reported by
 * leaf 0), the extended leaves (0x80000000 .. limit reported by leaf
 * 0x80000000) and, when env->cpuid_xlevel2 is non-zero, the Centaur leaves
 * (0xC0000000 .. limit reported by leaf 0xC0000000).
 *
 * As a side effect the architectural PMU globals
 * (has_architectural_pmu_version, num_architectural_pmu_gp_counters,
 * num_architectural_pmu_fixed_counters) are refreshed from leaf 0xa.
 *
 * Returns the index of the next free slot, i.e. the new entry count.
 * Aborts the process if the array runs out of slots.
 */
static uint32_t kvm_x86_build_cpuid(CPUX86State *env,
                                    struct kvm_cpuid_entry2 *entries,
                                    uint32_t cpuid_i)
{
    uint32_t limit, i, j;
    uint32_t unused;
    struct kvm_cpuid_entry2 *c;

    /* EAX of leaf 0 is the highest basic leaf. */
    cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);

    for (i = 0; i <= limit; i++) {
        /* j is kept current so that the "full" message names the sub-leaf. */
        j = 0;
        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
            goto full;
        }
        /* Reserve a slot up front; some cases below give it back. */
        c = &entries[cpuid_i++];
        switch (i) {
        case 2: {
            /* Keep reading function 2 till all the input is received */
            int times;

            c->function = i;
            c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
                       KVM_CPUID_FLAG_STATE_READ_NEXT;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            /* The low byte of EAX is the total number of reads. */
            times = c->eax & 0xff;

            for (j = 1; j < times; ++j) {
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    goto full;
                }
                c = &entries[cpuid_i++];
                c->function = i;
                c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
                cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            }
            break;
        }
        case 0x1f:
            if (!x86_has_extended_topo(env->avail_cpu_topo)) {
                /* Leaf is not exposed: release the slot reserved above. */
                cpuid_i--;
                break;
            }
            /* fallthrough */
        case 4:
        case 0xb:
        case 0xd:
            /*
             * Indexed leaves.  Each iteration fills the current slot and then
             * reserves the next one unless a termination test fires; the
             * sub-leaf that triggers a break stays in its slot.
             */
            for (j = 0; ; j++) {
                /* Leaf 0xd is scanned for sub-leaves 0..63 only. */
                if (i == 0xd && j == 64) {
                    break;
                }

                c->function = i;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                c->index = j;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);

                if (i == 4 && c->eax == 0) {
                    break;
                }
                if (i == 0xb && !(c->ecx & 0xff00)) {
                    break;
                }
                if (i == 0x1f && !(c->ecx & 0xff00)) {
                    break;
                }
                if (i == 0xd && c->eax == 0) {
                    /* Reuse the same slot for the next sub-leaf. */
                    continue;
                }
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    goto full;
                }
                c = &entries[cpuid_i++];
            }
            break;
        case 0x12:
            /*
             * Sub-leaves 0 and 1 are always emitted; from sub-leaf 2 onwards
             * the walk stops at the first one whose EAX[3:0] is not 1.
             */
            for (j = 0; ; j++) {
                c->function = i;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                c->index = j;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);

                if (j > 1 && (c->eax & 0xf) != 1) {
                    break;
                }

                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    goto full;
                }
                c = &entries[cpuid_i++];
            }
            break;
        case 0x7:
        case 0x14:
        case 0x1d:
        case 0x1e: {
            uint32_t times;

            c->function = i;
            c->index = 0;
            c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            /* EAX of sub-leaf 0 is used as the highest sub-leaf index. */
            times = c->eax;

            for (j = 1; j <= times; ++j) {
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    goto full;
                }
                c = &entries[cpuid_i++];
                c->function = i;
                c->index = j;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
            }
            break;
        }
        default:
            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
                /*
                 * KVM already returns all zeroes if a CPUID entry is missing,
                 * so we can omit it and avoid hitting KVM's 80-entry limit.
                 */
                cpuid_i--;
            }
            break;
        }
    }

    /* Cache the architectural PMU parameters advertised in leaf 0xa. */
    if (limit >= 0x0a) {
        uint32_t eax, edx;

        cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx);

        has_architectural_pmu_version = eax & 0xff;
        if (has_architectural_pmu_version > 0) {
            num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8;

            /* Shouldn't be more than 32, since that's the number of bits
             * available in EBX to tell us _which_ counters are available.
             * Play it safe.
             */
            if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) {
                num_architectural_pmu_gp_counters = MAX_GP_COUNTERS;
            }

            /* The fixed counter count is only read for PMU version 2 and up. */
            if (has_architectural_pmu_version > 1) {
                num_architectural_pmu_fixed_counters = edx & 0x1f;

                if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) {
                    num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS;
                }
            }
        }
    }

    /* EAX of leaf 0x80000000 is the highest extended leaf. */
    cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);

    for (i = 0x80000000; i <= limit; i++) {
        j = 0;
        if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
            goto full;
        }
        c = &entries[cpuid_i++];

        switch (i) {
        case 0x8000001d:
            /* Query for all AMD cache information leaves */
            for (j = 0; ; j++) {
                c->function = i;
                c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
                c->index = j;
                cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);

                /* A sub-leaf with EAX == 0 ends the list. */
                if (c->eax == 0) {
                    break;
                }
                if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                    goto full;
                }
                c = &entries[cpuid_i++];
            }
            break;
        default:
            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
            if (!c->eax && !c->ebx && !c->ecx && !c->edx) {
                /*
                 * KVM already returns all zeroes if a CPUID entry is missing,
                 * so we can omit it and avoid hitting KVM's 80-entry limit.
                 */
                cpuid_i--;
            }
            break;
        }
    }

    /* Call Centaur's CPUID instructions if they are supported. */
    if (env->cpuid_xlevel2 > 0) {
        cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);

        for (i = 0xC0000000; i <= limit; i++) {
            j = 0;
            if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
                goto full;
            }
            c = &entries[cpuid_i++];

            c->function = i;
            c->flags = 0;
            cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
        }
    }

    return cpuid_i;

full:
    /* Out of slots: report the leaf/sub-leaf that did not fit and give up. */
    fprintf(stderr, "cpuid_data is full, no space for "
            "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
    abort();
}
2014 
2015 int kvm_arch_init_vcpu(CPUState *cs)
2016 {
2017     struct {
2018         struct kvm_cpuid2 cpuid;
2019         struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
2020     } cpuid_data;
2021     /*
2022      * The kernel defines these structs with padding fields so there
2023      * should be no extra padding in our cpuid_data struct.
2024      */
2025     QEMU_BUILD_BUG_ON(sizeof(cpuid_data) !=
2026                       sizeof(struct kvm_cpuid2) +
2027                       sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
2028 
2029     X86CPU *cpu = X86_CPU(cs);
2030     CPUX86State *env = &cpu->env;
2031     uint32_t cpuid_i;
2032     struct kvm_cpuid_entry2 *c;
2033     uint32_t signature[3];
2034     int kvm_base = KVM_CPUID_SIGNATURE;
2035     int max_nested_state_len;
2036     int r;
2037     Error *local_err = NULL;
2038 
2039     memset(&cpuid_data, 0, sizeof(cpuid_data));
2040 
2041     cpuid_i = 0;
2042 
2043     has_xsave2 = kvm_check_extension(cs->kvm_state, KVM_CAP_XSAVE2);
2044 
2045     r = kvm_arch_set_tsc_khz(cs);
2046     if (r < 0) {
2047         return r;
2048     }
2049 
2050     /* vcpu's TSC frequency is either specified by user, or following
2051      * the value used by KVM if the former is not present. In the
2052      * latter case, we query it from KVM and record in env->tsc_khz,
2053      * so that vcpu's TSC frequency can be migrated later via this field.
2054      */
2055     if (!env->tsc_khz) {
2056         r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
2057             kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
2058             -ENOTSUP;
2059         if (r > 0) {
2060             env->tsc_khz = r;
2061         }
2062     }
2063 
2064     env->apic_bus_freq = KVM_APIC_BUS_FREQUENCY;
2065 
2066     /*
2067      * kvm_hyperv_expand_features() is called here for the second time in case
2068      * KVM_CAP_SYS_HYPERV_CPUID is not supported. While we can't possibly handle
2069      * 'query-cpu-model-expansion' in this case as we don't have a KVM vCPU to
2070      * check which Hyper-V enlightenments are supported and which are not, we
2071      * can still proceed and check/expand Hyper-V enlightenments here so legacy
2072      * behavior is preserved.
2073      */
2074     if (!kvm_hyperv_expand_features(cpu, &local_err)) {
2075         error_report_err(local_err);
2076         return -ENOSYS;
2077     }
2078 
2079     if (hyperv_enabled(cpu)) {
2080         r = hyperv_init_vcpu(cpu);
2081         if (r) {
2082             return r;
2083         }
2084 
2085         cpuid_i = hyperv_fill_cpuids(cs, cpuid_data.entries);
2086         kvm_base = KVM_CPUID_SIGNATURE_NEXT;
2087         has_msr_hv_hypercall = true;
2088     }
2089 
2090     if (cs->kvm_state->xen_version) {
2091 #ifdef CONFIG_XEN_EMU
2092         struct kvm_cpuid_entry2 *xen_max_leaf;
2093 
2094         memcpy(signature, "XenVMMXenVMM", 12);
2095 
2096         xen_max_leaf = c = &cpuid_data.entries[cpuid_i++];
2097         c->function = kvm_base + XEN_CPUID_SIGNATURE;
2098         c->eax = kvm_base + XEN_CPUID_TIME;
2099         c->ebx = signature[0];
2100         c->ecx = signature[1];
2101         c->edx = signature[2];
2102 
2103         c = &cpuid_data.entries[cpuid_i++];
2104         c->function = kvm_base + XEN_CPUID_VENDOR;
2105         c->eax = cs->kvm_state->xen_version;
2106         c->ebx = 0;
2107         c->ecx = 0;
2108         c->edx = 0;
2109 
2110         c = &cpuid_data.entries[cpuid_i++];
2111         c->function = kvm_base + XEN_CPUID_HVM_MSR;
2112         /* Number of hypercall-transfer pages */
2113         c->eax = 1;
2114         /* Hypercall MSR base address */
2115         if (hyperv_enabled(cpu)) {
2116             c->ebx = XEN_HYPERCALL_MSR_HYPERV;
2117             kvm_xen_init(cs->kvm_state, c->ebx);
2118         } else {
2119             c->ebx = XEN_HYPERCALL_MSR;
2120         }
2121         c->ecx = 0;
2122         c->edx = 0;
2123 
2124         c = &cpuid_data.entries[cpuid_i++];
2125         c->function = kvm_base + XEN_CPUID_TIME;
2126         c->eax = ((!!tsc_is_stable_and_known(env) << 1) |
2127             (!!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP) << 2));
2128         /* default=0 (emulate if necessary) */
2129         c->ebx = 0;
2130         /* guest tsc frequency */
2131         c->ecx = env->user_tsc_khz;
2132         /* guest tsc incarnation (migration count) */
2133         c->edx = 0;
2134 
2135         c = &cpuid_data.entries[cpuid_i++];
2136         c->function = kvm_base + XEN_CPUID_HVM;
2137         xen_max_leaf->eax = kvm_base + XEN_CPUID_HVM;
2138         if (cs->kvm_state->xen_version >= XEN_VERSION(4, 5)) {
2139             c->function = kvm_base + XEN_CPUID_HVM;
2140 
2141             if (cpu->xen_vapic) {
2142                 c->eax |= XEN_HVM_CPUID_APIC_ACCESS_VIRT;
2143                 c->eax |= XEN_HVM_CPUID_X2APIC_VIRT;
2144             }
2145 
2146             c->eax |= XEN_HVM_CPUID_IOMMU_MAPPINGS;
2147 
2148             if (cs->kvm_state->xen_version >= XEN_VERSION(4, 6)) {
2149                 c->eax |= XEN_HVM_CPUID_VCPU_ID_PRESENT;
2150                 c->ebx = cs->cpu_index;
2151             }
2152 
2153             if (cs->kvm_state->xen_version >= XEN_VERSION(4, 17)) {
2154                 c->eax |= XEN_HVM_CPUID_UPCALL_VECTOR;
2155             }
2156         }
2157 
2158         r = kvm_xen_init_vcpu(cs);
2159         if (r) {
2160             return r;
2161         }
2162 
2163         kvm_base += 0x100;
2164 #else /* CONFIG_XEN_EMU */
2165         /* This should never happen as kvm_arch_init() would have died first. */
2166         fprintf(stderr, "Cannot enable Xen CPUID without Xen support\n");
2167         abort();
2168 #endif
2169     } else if (cpu->expose_kvm) {
2170         memcpy(signature, "KVMKVMKVM\0\0\0", 12);
2171         c = &cpuid_data.entries[cpuid_i++];
2172         c->function = KVM_CPUID_SIGNATURE | kvm_base;
2173         c->eax = KVM_CPUID_FEATURES | kvm_base;
2174         c->ebx = signature[0];
2175         c->ecx = signature[1];
2176         c->edx = signature[2];
2177 
2178         c = &cpuid_data.entries[cpuid_i++];
2179         c->function = KVM_CPUID_FEATURES | kvm_base;
2180         c->eax = env->features[FEAT_KVM];
2181         c->edx = env->features[FEAT_KVM_HINTS];
2182     }
2183 
2184     if (cpu->kvm_pv_enforce_cpuid) {
2185         r = kvm_vcpu_enable_cap(cs, KVM_CAP_ENFORCE_PV_FEATURE_CPUID, 0, 1);
2186         if (r < 0) {
2187             fprintf(stderr,
2188                     "failed to enable KVM_CAP_ENFORCE_PV_FEATURE_CPUID: %s",
2189                     strerror(-r));
2190             abort();
2191         }
2192     }
2193 
2194     cpuid_i = kvm_x86_build_cpuid(env, cpuid_data.entries, cpuid_i);
2195     cpuid_data.cpuid.nent = cpuid_i;
2196 
2197     if (((env->cpuid_version >> 8)&0xF) >= 6
2198         && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
2199            (CPUID_MCE | CPUID_MCA)) {
2200         uint64_t mcg_cap, unsupported_caps;
2201         int banks;
2202         int ret;
2203 
2204         ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
2205         if (ret < 0) {
2206             fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
2207             return ret;
2208         }
2209 
2210         if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) {
2211             error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)",
2212                          (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks);
2213             return -ENOTSUP;
2214         }
2215 
2216         unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK);
2217         if (unsupported_caps) {
2218             if (unsupported_caps & MCG_LMCE_P) {
2219                 error_report("kvm: LMCE not supported");
2220                 return -ENOTSUP;
2221             }
2222             warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64,
2223                         unsupported_caps);
2224         }
2225 
2226         env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK;
2227         ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap);
2228         if (ret < 0) {
2229             fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
2230             return ret;
2231         }
2232     }
2233 
2234     cpu->vmsentry = qemu_add_vm_change_state_handler(cpu_update_state, env);
2235 
2236     c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
2237     if (c) {
2238         has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
2239                                   !!(c->ecx & CPUID_EXT_SMX);
2240     }
2241 
2242     c = cpuid_find_entry(&cpuid_data.cpuid, 7, 0);
2243     if (c && (c->ebx & CPUID_7_0_EBX_SGX)) {
2244         has_msr_feature_control = true;
2245     }
2246 
2247     if (env->mcg_cap & MCG_LMCE_P) {
2248         has_msr_mcg_ext_ctl = has_msr_feature_control = true;
2249     }
2250 
2251     if (!env->user_tsc_khz) {
2252         if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
2253             invtsc_mig_blocker == NULL) {
2254             error_setg(&invtsc_mig_blocker,
2255                        "State blocked by non-migratable CPU device"
2256                        " (invtsc flag)");
2257             r = migrate_add_blocker(&invtsc_mig_blocker, &local_err);
2258             if (r < 0) {
2259                 error_report_err(local_err);
2260                 return r;
2261             }
2262         }
2263     }
2264 
2265     if (cpu->vmware_cpuid_freq
2266         /* Guests depend on 0x40000000 to detect this feature, so only expose
2267          * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */
2268         && cpu->expose_kvm
2269         && kvm_base == KVM_CPUID_SIGNATURE
2270         /* TSC clock must be stable and known for this feature. */
2271         && tsc_is_stable_and_known(env)) {
2272 
2273         c = &cpuid_data.entries[cpuid_i++];
2274         c->function = KVM_CPUID_SIGNATURE | 0x10;
2275         c->eax = env->tsc_khz;
2276         c->ebx = env->apic_bus_freq / 1000; /* Hz to KHz */
2277         c->ecx = c->edx = 0;
2278 
2279         c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0);
2280         c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10);
2281     }
2282 
2283     cpuid_data.cpuid.nent = cpuid_i;
2284 
2285     cpuid_data.cpuid.padding = 0;
2286     r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
2287     if (r) {
2288         goto fail;
2289     }
2290     kvm_init_xsave(env);
2291 
2292     max_nested_state_len = kvm_max_nested_state_length();
2293     if (max_nested_state_len > 0) {
2294         assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
2295 
2296         if (cpu_has_vmx(env) || cpu_has_svm(env)) {
2297             env->nested_state = g_malloc0(max_nested_state_len);
2298             env->nested_state->size = max_nested_state_len;
2299 
2300             kvm_init_nested_state(env);
2301         }
2302     }
2303 
2304     cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
2305 
2306     if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
2307         has_msr_tsc_aux = false;
2308     }
2309 
2310     kvm_init_msrs(cpu);
2311 
2312     return 0;
2313 
2314  fail:
2315     migrate_del_blocker(&invtsc_mig_blocker);
2316 
2317     return r;
2318 }
2319 
2320 int kvm_arch_destroy_vcpu(CPUState *cs)
2321 {
2322     X86CPU *cpu = X86_CPU(cs);
2323     CPUX86State *env = &cpu->env;
2324 
2325     g_free(env->xsave_buf);
2326 
2327     g_free(cpu->kvm_msr_buf);
2328     cpu->kvm_msr_buf = NULL;
2329 
2330     g_free(env->nested_state);
2331     env->nested_state = NULL;
2332 
2333     qemu_del_vm_change_state_handler(cpu->vmsentry);
2334 
2335     return 0;
2336 }
2337 
2338 void kvm_arch_reset_vcpu(X86CPU *cpu)
2339 {
2340     CPUX86State *env = &cpu->env;
2341 
2342     env->xcr0 = 1;
2343     if (kvm_irqchip_in_kernel()) {
2344         env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
2345                                           KVM_MP_STATE_UNINITIALIZED;
2346     } else {
2347         env->mp_state = KVM_MP_STATE_RUNNABLE;
2348     }
2349 
2350     /* enabled by default */
2351     env->poll_control_msr = 1;
2352 
2353     kvm_init_nested_state(env);
2354 
2355     sev_es_set_reset_vector(CPU(cpu));
2356 }
2357 
2358 void kvm_arch_after_reset_vcpu(X86CPU *cpu)
2359 {
2360     CPUX86State *env = &cpu->env;
2361     int i;
2362 
2363     /*
2364      * Reset SynIC after all other devices have been reset to let them remove
2365      * their SINT routes first.
2366      */
2367     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
2368         for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) {
2369             env->msr_hv_synic_sint[i] = HV_SINT_MASKED;
2370         }
2371 
2372         hyperv_x86_synic_reset(cpu);
2373     }
2374 }
2375 
2376 void kvm_arch_do_init_vcpu(X86CPU *cpu)
2377 {
2378     CPUX86State *env = &cpu->env;
2379 
2380     /* APs get directly into wait-for-SIPI state.  */
2381     if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) {
2382         env->mp_state = KVM_MP_STATE_INIT_RECEIVED;
2383     }
2384 }
2385 
2386 static int kvm_get_supported_feature_msrs(KVMState *s)
2387 {
2388     int ret = 0;
2389 
2390     if (kvm_feature_msrs != NULL) {
2391         return 0;
2392     }
2393 
2394     if (!kvm_check_extension(s, KVM_CAP_GET_MSR_FEATURES)) {
2395         return 0;
2396     }
2397 
2398     struct kvm_msr_list msr_list;
2399 
2400     msr_list.nmsrs = 0;
2401     ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, &msr_list);
2402     if (ret < 0 && ret != -E2BIG) {
2403         error_report("Fetch KVM feature MSR list failed: %s",
2404             strerror(-ret));
2405         return ret;
2406     }
2407 
2408     assert(msr_list.nmsrs > 0);
2409     kvm_feature_msrs = g_malloc0(sizeof(msr_list) +
2410                  msr_list.nmsrs * sizeof(msr_list.indices[0]));
2411 
2412     kvm_feature_msrs->nmsrs = msr_list.nmsrs;
2413     ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, kvm_feature_msrs);
2414 
2415     if (ret < 0) {
2416         error_report("Fetch KVM feature MSR list failed: %s",
2417             strerror(-ret));
2418         g_free(kvm_feature_msrs);
2419         kvm_feature_msrs = NULL;
2420         return ret;
2421     }
2422 
2423     return 0;
2424 }
2425 
2426 static int kvm_get_supported_msrs(KVMState *s)
2427 {
2428     int ret = 0;
2429     struct kvm_msr_list msr_list, *kvm_msr_list;
2430 
2431     /*
2432      *  Obtain MSR list from KVM.  These are the MSRs that we must
2433      *  save/restore.
2434      */
2435     msr_list.nmsrs = 0;
2436     ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
2437     if (ret < 0 && ret != -E2BIG) {
2438         return ret;
2439     }
2440     /*
2441      * Old kernel modules had a bug and could write beyond the provided
2442      * memory. Allocate at least a safe amount of 1K.
2443      */
2444     kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
2445                                           msr_list.nmsrs *
2446                                           sizeof(msr_list.indices[0])));
2447 
2448     kvm_msr_list->nmsrs = msr_list.nmsrs;
2449     ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
2450     if (ret >= 0) {
2451         int i;
2452 
2453         for (i = 0; i < kvm_msr_list->nmsrs; i++) {
2454             switch (kvm_msr_list->indices[i]) {
2455             case MSR_STAR:
2456                 has_msr_star = true;
2457                 break;
2458             case MSR_VM_HSAVE_PA:
2459                 has_msr_hsave_pa = true;
2460                 break;
2461             case MSR_TSC_AUX:
2462                 has_msr_tsc_aux = true;
2463                 break;
2464             case MSR_TSC_ADJUST:
2465                 has_msr_tsc_adjust = true;
2466                 break;
2467             case MSR_IA32_TSCDEADLINE:
2468                 has_msr_tsc_deadline = true;
2469                 break;
2470             case MSR_IA32_SMBASE:
2471                 has_msr_smbase = true;
2472                 break;
2473             case MSR_SMI_COUNT:
2474                 has_msr_smi_count = true;
2475                 break;
2476             case MSR_IA32_MISC_ENABLE:
2477                 has_msr_misc_enable = true;
2478                 break;
2479             case MSR_IA32_BNDCFGS:
2480                 has_msr_bndcfgs = true;
2481                 break;
2482             case MSR_IA32_XSS:
2483                 has_msr_xss = true;
2484                 break;
2485             case MSR_IA32_UMWAIT_CONTROL:
2486                 has_msr_umwait = true;
2487                 break;
2488             case HV_X64_MSR_CRASH_CTL:
2489                 has_msr_hv_crash = true;
2490                 break;
2491             case HV_X64_MSR_RESET:
2492                 has_msr_hv_reset = true;
2493                 break;
2494             case HV_X64_MSR_VP_INDEX:
2495                 has_msr_hv_vpindex = true;
2496                 break;
2497             case HV_X64_MSR_VP_RUNTIME:
2498                 has_msr_hv_runtime = true;
2499                 break;
2500             case HV_X64_MSR_SCONTROL:
2501                 has_msr_hv_synic = true;
2502                 break;
2503             case HV_X64_MSR_STIMER0_CONFIG:
2504                 has_msr_hv_stimer = true;
2505                 break;
2506             case HV_X64_MSR_TSC_FREQUENCY:
2507                 has_msr_hv_frequencies = true;
2508                 break;
2509             case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
2510                 has_msr_hv_reenlightenment = true;
2511                 break;
2512             case HV_X64_MSR_SYNDBG_OPTIONS:
2513                 has_msr_hv_syndbg_options = true;
2514                 break;
2515             case MSR_IA32_SPEC_CTRL:
2516                 has_msr_spec_ctrl = true;
2517                 break;
2518             case MSR_AMD64_TSC_RATIO:
2519                 has_tsc_scale_msr = true;
2520                 break;
2521             case MSR_IA32_TSX_CTRL:
2522                 has_msr_tsx_ctrl = true;
2523                 break;
2524             case MSR_VIRT_SSBD:
2525                 has_msr_virt_ssbd = true;
2526                 break;
2527             case MSR_IA32_ARCH_CAPABILITIES:
2528                 has_msr_arch_capabs = true;
2529                 break;
2530             case MSR_IA32_CORE_CAPABILITY:
2531                 has_msr_core_capabs = true;
2532                 break;
2533             case MSR_IA32_PERF_CAPABILITIES:
2534                 has_msr_perf_capabs = true;
2535                 break;
2536             case MSR_IA32_VMX_VMFUNC:
2537                 has_msr_vmx_vmfunc = true;
2538                 break;
2539             case MSR_IA32_UCODE_REV:
2540                 has_msr_ucode_rev = true;
2541                 break;
2542             case MSR_IA32_VMX_PROCBASED_CTLS2:
2543                 has_msr_vmx_procbased_ctls2 = true;
2544                 break;
2545             case MSR_IA32_PKRS:
2546                 has_msr_pkrs = true;
2547                 break;
2548             }
2549         }
2550     }
2551 
2552     g_free(kvm_msr_list);
2553 
2554     return ret;
2555 }
2556 
2557 static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
2558                                         uint64_t *val)
2559 {
2560     CPUState *cs = CPU(cpu);
2561 
2562     *val = cs->nr_threads * cs->nr_cores; /* thread count, bits 15..0 */
2563     *val |= ((uint32_t)cs->nr_cores << 16); /* core count, bits 31..16 */
2564 
2565     return true;
2566 }
2567 
/*
 * File-scope state for the KVM SMRAM address space.  All but the notifier
 * are initialized by register_smram_listener().
 */
/* NOTE(review): presumably the notifier that triggers
 * register_smram_listener(); its registration is not visible here. */
static Notifier smram_machine_done;
/* KVM memory listener attached to smram_address_space. */
static KVMMemoryListener smram_listener;
/* Address space rooted at smram_as_root. */
static AddressSpace smram_address_space;
/* Outer container region. */
static MemoryRegion smram_as_root;
/* Alias of system memory placed inside the container. */
static MemoryRegion smram_as_mem;
2573 
2574 static void register_smram_listener(Notifier *n, void *unused)
2575 {
2576     MemoryRegion *smram =
2577         (MemoryRegion *) object_resolve_path("/machine/smram", NULL);
2578 
2579     /* Outer container... */
2580     memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull);
2581     memory_region_set_enabled(&smram_as_root, true);
2582 
2583     /* ... with two regions inside: normal system memory with low
2584      * priority, and...
2585      */
2586     memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram",
2587                              get_system_memory(), 0, ~0ull);
2588     memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0);
2589     memory_region_set_enabled(&smram_as_mem, true);
2590 
2591     if (smram) {
2592         /* ... SMRAM with higher priority */
2593         memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10);
2594         memory_region_set_enabled(smram, true);
2595     }
2596 
2597     address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM");
2598     kvm_memory_listener_register(kvm_state, &smram_listener,
2599                                  &smram_address_space, 1, "kvm-smram");
2600 }
2601 
2602 int kvm_arch_get_default_type(MachineState *ms)
2603 {
2604     return 0;
2605 }
2606 
2607 int kvm_arch_init(MachineState *ms, KVMState *s)
2608 {
2609     uint64_t identity_base = 0xfffbc000;
2610     uint64_t shadow_mem;
2611     int ret;
2612     struct utsname utsname;
2613     Error *local_err = NULL;
2614 
2615     /*
2616      * Initialize SEV context, if required
2617      *
2618      * If no memory encryption is requested (ms->cgs == NULL) this is
2619      * a no-op.
2620      *
2621      * It's also a no-op if a non-SEV confidential guest support
2622      * mechanism is selected.  SEV is the only mechanism available to
2623      * select on x86 at present, so this doesn't arise, but if new
2624      * mechanisms are supported in future (e.g. TDX), they'll need
2625      * their own initialization either here or elsewhere.
2626      */
2627     if (ms->cgs) {
2628         ret = confidential_guest_kvm_init(ms->cgs, &local_err);
2629         if (ret < 0) {
2630             error_report_err(local_err);
2631             return ret;
2632         }
2633     }
2634 
2635     has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
2636     has_sregs2 = kvm_check_extension(s, KVM_CAP_SREGS2) > 0;
2637 
2638     hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
2639 
2640     has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
2641     if (has_exception_payload) {
2642         ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
2643         if (ret < 0) {
2644             error_report("kvm: Failed to enable exception payload cap: %s",
2645                          strerror(-ret));
2646             return ret;
2647         }
2648     }
2649 
2650     has_triple_fault_event = kvm_check_extension(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT);
2651     if (has_triple_fault_event) {
2652         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 0, true);
2653         if (ret < 0) {
2654             error_report("kvm: Failed to enable triple fault event cap: %s",
2655                          strerror(-ret));
2656             return ret;
2657         }
2658     }
2659 
2660     if (s->xen_version) {
2661 #ifdef CONFIG_XEN_EMU
2662         if (!object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE)) {
2663             error_report("kvm: Xen support only available in PC machine");
2664             return -ENOTSUP;
2665         }
2666         /* hyperv_enabled() doesn't work yet. */
2667         uint32_t msr = XEN_HYPERCALL_MSR;
2668         ret = kvm_xen_init(s, msr);
2669         if (ret < 0) {
2670             return ret;
2671         }
2672 #else
2673         error_report("kvm: Xen support not enabled in qemu");
2674         return -ENOTSUP;
2675 #endif
2676     }
2677 
2678     ret = kvm_get_supported_msrs(s);
2679     if (ret < 0) {
2680         return ret;
2681     }
2682 
2683     kvm_get_supported_feature_msrs(s);
2684 
2685     uname(&utsname);
2686     lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
2687 
2688     /*
2689      * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
2690      * In order to use vm86 mode, an EPT identity map and a TSS  are needed.
2691      * Since these must be part of guest physical memory, we need to allocate
2692      * them, both by setting their start addresses in the kernel and by
2693      * creating a corresponding e820 entry. We need 4 pages before the BIOS,
2694      * so this value allows up to 16M BIOSes.
2695      */
2696     identity_base = 0xfeffc000;
2697     ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
2698     if (ret < 0) {
2699         return ret;
2700     }
2701 
2702     /* Set TSS base one page after EPT identity map. */
2703     ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
2704     if (ret < 0) {
2705         return ret;
2706     }
2707 
2708     /* Tell fw_cfg to notify the BIOS to reserve the range. */
2709     e820_add_entry(identity_base, 0x4000, E820_RESERVED);
2710 
2711     shadow_mem = object_property_get_int(OBJECT(s), "kvm-shadow-mem", &error_abort);
2712     if (shadow_mem != -1) {
2713         shadow_mem /= 4096;
2714         ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
2715         if (ret < 0) {
2716             return ret;
2717         }
2718     }
2719 
2720     if (kvm_check_extension(s, KVM_CAP_X86_SMM) &&
2721         object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) &&
2722         x86_machine_is_smm_enabled(X86_MACHINE(ms))) {
2723         smram_machine_done.notify = register_smram_listener;
2724         qemu_add_machine_init_done_notifier(&smram_machine_done);
2725     }
2726 
2727     if (enable_cpu_pm) {
2728         int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
2729 /* Work around for kernel header with a typo. TODO: fix header and drop. */
2730 #if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
2731 #define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
2732 #endif
2733         if (disable_exits) {
2734             disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
2735                               KVM_X86_DISABLE_EXITS_HLT |
2736                               KVM_X86_DISABLE_EXITS_PAUSE |
2737                               KVM_X86_DISABLE_EXITS_CSTATE);
2738         }
2739 
2740         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
2741                                 disable_exits);
2742         if (ret < 0) {
2743             error_report("kvm: guest stopping CPU not supported: %s",
2744                          strerror(-ret));
2745         }
2746     }
2747 
2748     if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) {
2749         X86MachineState *x86ms = X86_MACHINE(ms);
2750 
2751         if (x86ms->bus_lock_ratelimit > 0) {
2752             ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT);
2753             if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) {
2754                 error_report("kvm: bus lock detection unsupported");
2755                 return -ENOTSUP;
2756             }
2757             ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0,
2758                                     KVM_BUS_LOCK_DETECTION_EXIT);
2759             if (ret < 0) {
2760                 error_report("kvm: Failed to enable bus lock detection cap: %s",
2761                              strerror(-ret));
2762                 return ret;
2763             }
2764             ratelimit_init(&bus_lock_ratelimit_ctrl);
2765             ratelimit_set_speed(&bus_lock_ratelimit_ctrl,
2766                                 x86ms->bus_lock_ratelimit, BUS_LOCK_SLICE_TIME);
2767         }
2768     }
2769 
2770     if (s->notify_vmexit != NOTIFY_VMEXIT_OPTION_DISABLE &&
2771         kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) {
2772             uint64_t notify_window_flags =
2773                 ((uint64_t)s->notify_window << 32) |
2774                 KVM_X86_NOTIFY_VMEXIT_ENABLED |
2775                 KVM_X86_NOTIFY_VMEXIT_USER;
2776             ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0,
2777                                     notify_window_flags);
2778             if (ret < 0) {
2779                 error_report("kvm: Failed to enable notify vmexit cap: %s",
2780                              strerror(-ret));
2781                 return ret;
2782             }
2783     }
2784     if (kvm_vm_check_extension(s, KVM_CAP_X86_USER_SPACE_MSR)) {
2785         bool r;
2786 
2787         ret = kvm_vm_enable_cap(s, KVM_CAP_X86_USER_SPACE_MSR, 0,
2788                                 KVM_MSR_EXIT_REASON_FILTER);
2789         if (ret) {
2790             error_report("Could not enable user space MSRs: %s",
2791                          strerror(-ret));
2792             exit(1);
2793         }
2794 
2795         r = kvm_filter_msr(s, MSR_CORE_THREAD_COUNT,
2796                            kvm_rdmsr_core_thread_count, NULL);
2797         if (!r) {
2798             error_report("Could not install MSR_CORE_THREAD_COUNT handler: %s",
2799                          strerror(-ret));
2800             exit(1);
2801         }
2802     }
2803 
2804     return 0;
2805 }
2806 
2807 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2808 {
2809     lhs->selector = rhs->selector;
2810     lhs->base = rhs->base;
2811     lhs->limit = rhs->limit;
2812     lhs->type = 3;
2813     lhs->present = 1;
2814     lhs->dpl = 3;
2815     lhs->db = 0;
2816     lhs->s = 1;
2817     lhs->l = 0;
2818     lhs->g = 0;
2819     lhs->avl = 0;
2820     lhs->unusable = 0;
2821 }
2822 
2823 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2824 {
2825     unsigned flags = rhs->flags;
2826     lhs->selector = rhs->selector;
2827     lhs->base = rhs->base;
2828     lhs->limit = rhs->limit;
2829     lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
2830     lhs->present = (flags & DESC_P_MASK) != 0;
2831     lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
2832     lhs->db = (flags >> DESC_B_SHIFT) & 1;
2833     lhs->s = (flags & DESC_S_MASK) != 0;
2834     lhs->l = (flags >> DESC_L_SHIFT) & 1;
2835     lhs->g = (flags & DESC_G_MASK) != 0;
2836     lhs->avl = (flags & DESC_AVL_MASK) != 0;
2837     lhs->unusable = !lhs->present;
2838     lhs->padding = 0;
2839 }
2840 
2841 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
2842 {
2843     lhs->selector = rhs->selector;
2844     lhs->base = rhs->base;
2845     lhs->limit = rhs->limit;
2846     lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
2847                  ((rhs->present && !rhs->unusable) * DESC_P_MASK) |
2848                  (rhs->dpl << DESC_DPL_SHIFT) |
2849                  (rhs->db << DESC_B_SHIFT) |
2850                  (rhs->s * DESC_S_MASK) |
2851                  (rhs->l << DESC_L_SHIFT) |
2852                  (rhs->g * DESC_G_MASK) |
2853                  (rhs->avl * DESC_AVL_MASK);
2854 }
2855 
2856 static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
2857 {
2858     if (set) {
2859         *kvm_reg = *qemu_reg;
2860     } else {
2861         *qemu_reg = *kvm_reg;
2862     }
2863 }
2864 
2865 static int kvm_getput_regs(X86CPU *cpu, int set)
2866 {
2867     CPUX86State *env = &cpu->env;
2868     struct kvm_regs regs;
2869     int ret = 0;
2870 
2871     if (!set) {
2872         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
2873         if (ret < 0) {
2874             return ret;
2875         }
2876     }
2877 
2878     kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
2879     kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
2880     kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
2881     kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
2882     kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
2883     kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
2884     kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
2885     kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
2886 #ifdef TARGET_X86_64
2887     kvm_getput_reg(&regs.r8, &env->regs[8], set);
2888     kvm_getput_reg(&regs.r9, &env->regs[9], set);
2889     kvm_getput_reg(&regs.r10, &env->regs[10], set);
2890     kvm_getput_reg(&regs.r11, &env->regs[11], set);
2891     kvm_getput_reg(&regs.r12, &env->regs[12], set);
2892     kvm_getput_reg(&regs.r13, &env->regs[13], set);
2893     kvm_getput_reg(&regs.r14, &env->regs[14], set);
2894     kvm_getput_reg(&regs.r15, &env->regs[15], set);
2895 #endif
2896 
2897     kvm_getput_reg(&regs.rflags, &env->eflags, set);
2898     kvm_getput_reg(&regs.rip, &env->eip, set);
2899 
2900     if (set) {
2901         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
2902     }
2903 
2904     return ret;
2905 }
2906 
2907 static int kvm_put_xsave(X86CPU *cpu)
2908 {
2909     CPUX86State *env = &cpu->env;
2910     void *xsave = env->xsave_buf;
2911 
2912     x86_cpu_xsave_all_areas(cpu, xsave, env->xsave_buf_len);
2913 
2914     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
2915 }
2916 
2917 static int kvm_put_xcrs(X86CPU *cpu)
2918 {
2919     CPUX86State *env = &cpu->env;
2920     struct kvm_xcrs xcrs = {};
2921 
2922     if (!has_xcrs) {
2923         return 0;
2924     }
2925 
2926     xcrs.nr_xcrs = 1;
2927     xcrs.flags = 0;
2928     xcrs.xcrs[0].xcr = 0;
2929     xcrs.xcrs[0].value = env->xcr0;
2930     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
2931 }
2932 
2933 static int kvm_put_sregs(X86CPU *cpu)
2934 {
2935     CPUX86State *env = &cpu->env;
2936     struct kvm_sregs sregs;
2937 
2938     /*
2939      * The interrupt_bitmap is ignored because KVM_SET_SREGS is
2940      * always followed by KVM_SET_VCPU_EVENTS.
2941      */
2942     memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
2943 
2944     if ((env->eflags & VM_MASK)) {
2945         set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
2946         set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
2947         set_v8086_seg(&sregs.es, &env->segs[R_ES]);
2948         set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
2949         set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
2950         set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
2951     } else {
2952         set_seg(&sregs.cs, &env->segs[R_CS]);
2953         set_seg(&sregs.ds, &env->segs[R_DS]);
2954         set_seg(&sregs.es, &env->segs[R_ES]);
2955         set_seg(&sregs.fs, &env->segs[R_FS]);
2956         set_seg(&sregs.gs, &env->segs[R_GS]);
2957         set_seg(&sregs.ss, &env->segs[R_SS]);
2958     }
2959 
2960     set_seg(&sregs.tr, &env->tr);
2961     set_seg(&sregs.ldt, &env->ldt);
2962 
2963     sregs.idt.limit = env->idt.limit;
2964     sregs.idt.base = env->idt.base;
2965     memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
2966     sregs.gdt.limit = env->gdt.limit;
2967     sregs.gdt.base = env->gdt.base;
2968     memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
2969 
2970     sregs.cr0 = env->cr[0];
2971     sregs.cr2 = env->cr[2];
2972     sregs.cr3 = env->cr[3];
2973     sregs.cr4 = env->cr[4];
2974 
2975     sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
2976     sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
2977 
2978     sregs.efer = env->efer;
2979 
2980     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
2981 }
2982 
2983 static int kvm_put_sregs2(X86CPU *cpu)
2984 {
2985     CPUX86State *env = &cpu->env;
2986     struct kvm_sregs2 sregs;
2987     int i;
2988 
2989     sregs.flags = 0;
2990 
2991     if ((env->eflags & VM_MASK)) {
2992         set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
2993         set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
2994         set_v8086_seg(&sregs.es, &env->segs[R_ES]);
2995         set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
2996         set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
2997         set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
2998     } else {
2999         set_seg(&sregs.cs, &env->segs[R_CS]);
3000         set_seg(&sregs.ds, &env->segs[R_DS]);
3001         set_seg(&sregs.es, &env->segs[R_ES]);
3002         set_seg(&sregs.fs, &env->segs[R_FS]);
3003         set_seg(&sregs.gs, &env->segs[R_GS]);
3004         set_seg(&sregs.ss, &env->segs[R_SS]);
3005     }
3006 
3007     set_seg(&sregs.tr, &env->tr);
3008     set_seg(&sregs.ldt, &env->ldt);
3009 
3010     sregs.idt.limit = env->idt.limit;
3011     sregs.idt.base = env->idt.base;
3012     memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
3013     sregs.gdt.limit = env->gdt.limit;
3014     sregs.gdt.base = env->gdt.base;
3015     memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
3016 
3017     sregs.cr0 = env->cr[0];
3018     sregs.cr2 = env->cr[2];
3019     sregs.cr3 = env->cr[3];
3020     sregs.cr4 = env->cr[4];
3021 
3022     sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
3023     sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
3024 
3025     sregs.efer = env->efer;
3026 
3027     if (env->pdptrs_valid) {
3028         for (i = 0; i < 4; i++) {
3029             sregs.pdptrs[i] = env->pdptrs[i];
3030         }
3031         sregs.flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
3032     }
3033 
3034     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS2, &sregs);
3035 }
3036 
3037 
3038 static void kvm_msr_buf_reset(X86CPU *cpu)
3039 {
3040     memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
3041 }
3042 
3043 static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value)
3044 {
3045     struct kvm_msrs *msrs = cpu->kvm_msr_buf;
3046     void *limit = ((void *)msrs) + MSR_BUF_SIZE;
3047     struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs];
3048 
3049     assert((void *)(entry + 1) <= limit);
3050 
3051     entry->index = index;
3052     entry->reserved = 0;
3053     entry->data = value;
3054     msrs->nmsrs++;
3055 }
3056 
3057 static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value)
3058 {
3059     kvm_msr_buf_reset(cpu);
3060     kvm_msr_entry_add(cpu, index, value);
3061 
3062     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
3063 }
3064 
3065 static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value)
3066 {
3067     int ret;
3068     struct {
3069         struct kvm_msrs info;
3070         struct kvm_msr_entry entries[1];
3071     } msr_data = {
3072         .info.nmsrs = 1,
3073         .entries[0].index = index,
3074     };
3075 
3076     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
3077     if (ret < 0) {
3078         return ret;
3079     }
3080     assert(ret == 1);
3081     *value = msr_data.entries[0].data;
3082     return ret;
3083 }
3084 void kvm_put_apicbase(X86CPU *cpu, uint64_t value)
3085 {
3086     int ret;
3087 
3088     ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value);
3089     assert(ret == 1);
3090 }
3091 
3092 static int kvm_put_tscdeadline_msr(X86CPU *cpu)
3093 {
3094     CPUX86State *env = &cpu->env;
3095     int ret;
3096 
3097     if (!has_msr_tsc_deadline) {
3098         return 0;
3099     }
3100 
3101     ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline);
3102     if (ret < 0) {
3103         return ret;
3104     }
3105 
3106     assert(ret == 1);
3107     return 0;
3108 }
3109 
3110 /*
3111  * Provide a separate write service for the feature control MSR in order to
3112  * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
3113  * before writing any other state because forcibly leaving nested mode
3114  * invalidates the VCPU state.
3115  */
3116 static int kvm_put_msr_feature_control(X86CPU *cpu)
3117 {
3118     int ret;
3119 
3120     if (!has_msr_feature_control) {
3121         return 0;
3122     }
3123 
3124     ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL,
3125                           cpu->env.msr_ia32_feature_control);
3126     if (ret < 0) {
3127         return ret;
3128     }
3129 
3130     assert(ret == 1);
3131     return 0;
3132 }
3133 
3134 static uint64_t make_vmx_msr_value(uint32_t index, uint32_t features)
3135 {
3136     uint32_t default1, can_be_one, can_be_zero;
3137     uint32_t must_be_one;
3138 
3139     switch (index) {
3140     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
3141         default1 = 0x00000016;
3142         break;
3143     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
3144         default1 = 0x0401e172;
3145         break;
3146     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
3147         default1 = 0x000011ff;
3148         break;
3149     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
3150         default1 = 0x00036dff;
3151         break;
3152     case MSR_IA32_VMX_PROCBASED_CTLS2:
3153         default1 = 0;
3154         break;
3155     default:
3156         abort();
3157     }
3158 
3159     /* If a feature bit is set, the control can be either set or clear.
3160      * Otherwise the value is limited to either 0 or 1 by default1.
3161      */
3162     can_be_one = features | default1;
3163     can_be_zero = features | ~default1;
3164     must_be_one = ~can_be_zero;
3165 
3166     /*
3167      * Bit 0:31 -> 0 if the control bit can be zero (i.e. 1 if it must be one).
3168      * Bit 32:63 -> 1 if the control bit can be one.
3169      */
3170     return must_be_one | (((uint64_t)can_be_one) << 32);
3171 }
3172 
3173 static void kvm_msr_entry_add_vmx(X86CPU *cpu, FeatureWordArray f)
3174 {
3175     uint64_t kvm_vmx_basic =
3176         kvm_arch_get_supported_msr_feature(kvm_state,
3177                                            MSR_IA32_VMX_BASIC);
3178 
3179     if (!kvm_vmx_basic) {
3180         /* If the kernel doesn't support VMX feature (kvm_intel.nested=0),
3181          * then kvm_vmx_basic will be 0 and KVM_SET_MSR will fail.
3182          */
3183         return;
3184     }
3185 
3186     uint64_t kvm_vmx_misc =
3187         kvm_arch_get_supported_msr_feature(kvm_state,
3188                                            MSR_IA32_VMX_MISC);
3189     uint64_t kvm_vmx_ept_vpid =
3190         kvm_arch_get_supported_msr_feature(kvm_state,
3191                                            MSR_IA32_VMX_EPT_VPID_CAP);
3192 
3193     /*
3194      * If the guest is 64-bit, a value of 1 is allowed for the host address
3195      * space size vmexit control.
3196      */
3197     uint64_t fixed_vmx_exit = f[FEAT_8000_0001_EDX] & CPUID_EXT2_LM
3198         ? (uint64_t)VMX_VM_EXIT_HOST_ADDR_SPACE_SIZE << 32 : 0;
3199 
3200     /*
3201      * Bits 0-30, 32-44 and 50-53 come from the host.  KVM should
3202      * not change them for backwards compatibility.
3203      */
3204     uint64_t fixed_vmx_basic = kvm_vmx_basic &
3205         (MSR_VMX_BASIC_VMCS_REVISION_MASK |
3206          MSR_VMX_BASIC_VMXON_REGION_SIZE_MASK |
3207          MSR_VMX_BASIC_VMCS_MEM_TYPE_MASK);
3208 
3209     /*
3210      * Same for bits 0-4 and 25-27.  Bits 16-24 (CR3 target count) can
3211      * change in the future but are always zero for now, clear them to be
3212      * future proof.  Bits 32-63 in theory could change, though KVM does
3213      * not support dual-monitor treatment and probably never will; mask
3214      * them out as well.
3215      */
3216     uint64_t fixed_vmx_misc = kvm_vmx_misc &
3217         (MSR_VMX_MISC_PREEMPTION_TIMER_SHIFT_MASK |
3218          MSR_VMX_MISC_MAX_MSR_LIST_SIZE_MASK);
3219 
3220     /*
3221      * EPT memory types should not change either, so we do not bother
3222      * adding features for them.
3223      */
3224     uint64_t fixed_vmx_ept_mask =
3225             (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_ENABLE_EPT ?
3226              MSR_VMX_EPT_UC | MSR_VMX_EPT_WB : 0);
3227     uint64_t fixed_vmx_ept_vpid = kvm_vmx_ept_vpid & fixed_vmx_ept_mask;
3228 
3229     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
3230                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
3231                                          f[FEAT_VMX_PROCBASED_CTLS]));
3232     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS,
3233                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_PINBASED_CTLS,
3234                                          f[FEAT_VMX_PINBASED_CTLS]));
3235     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_EXIT_CTLS,
3236                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_EXIT_CTLS,
3237                                          f[FEAT_VMX_EXIT_CTLS]) | fixed_vmx_exit);
3238     kvm_msr_entry_add(cpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS,
3239                       make_vmx_msr_value(MSR_IA32_VMX_TRUE_ENTRY_CTLS,
3240                                          f[FEAT_VMX_ENTRY_CTLS]));
3241     kvm_msr_entry_add(cpu, MSR_IA32_VMX_PROCBASED_CTLS2,
3242                       make_vmx_msr_value(MSR_IA32_VMX_PROCBASED_CTLS2,
3243                                          f[FEAT_VMX_SECONDARY_CTLS]));
3244     kvm_msr_entry_add(cpu, MSR_IA32_VMX_EPT_VPID_CAP,
3245                       f[FEAT_VMX_EPT_VPID_CAPS] | fixed_vmx_ept_vpid);
3246     kvm_msr_entry_add(cpu, MSR_IA32_VMX_BASIC,
3247                       f[FEAT_VMX_BASIC] | fixed_vmx_basic);
3248     kvm_msr_entry_add(cpu, MSR_IA32_VMX_MISC,
3249                       f[FEAT_VMX_MISC] | fixed_vmx_misc);
3250     if (has_msr_vmx_vmfunc) {
3251         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMFUNC, f[FEAT_VMX_VMFUNC]);
3252     }
3253 
3254     /*
3255      * Just to be safe, write these with constant values.  The CRn_FIXED1
3256      * MSRs are generated by KVM based on the vCPU's CPUID.
3257      */
3258     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR0_FIXED0,
3259                       CR0_PE_MASK | CR0_PG_MASK | CR0_NE_MASK);
3260     kvm_msr_entry_add(cpu, MSR_IA32_VMX_CR4_FIXED0,
3261                       CR4_VMXE_MASK);
3262 
3263     if (f[FEAT_VMX_SECONDARY_CTLS] & VMX_SECONDARY_EXEC_TSC_SCALING) {
3264         /* TSC multiplier (0x2032).  */
3265         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x32);
3266     } else {
3267         /* Preemption timer (0x482E).  */
3268         kvm_msr_entry_add(cpu, MSR_IA32_VMX_VMCS_ENUM, 0x2E);
3269     }
3270 }
3271 
3272 static void kvm_msr_entry_add_perf(X86CPU *cpu, FeatureWordArray f)
3273 {
3274     uint64_t kvm_perf_cap =
3275         kvm_arch_get_supported_msr_feature(kvm_state,
3276                                            MSR_IA32_PERF_CAPABILITIES);
3277 
3278     if (kvm_perf_cap) {
3279         kvm_msr_entry_add(cpu, MSR_IA32_PERF_CAPABILITIES,
3280                         kvm_perf_cap & f[FEAT_PERF_CAPABILITIES]);
3281     }
3282 }
3283 
3284 static int kvm_buf_set_msrs(X86CPU *cpu)
3285 {
3286     int ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
3287     if (ret < 0) {
3288         return ret;
3289     }
3290 
3291     if (ret < cpu->kvm_msr_buf->nmsrs) {
3292         struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
3293         error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64,
3294                      (uint32_t)e->index, (uint64_t)e->data);
3295     }
3296 
3297     assert(ret == cpu->kvm_msr_buf->nmsrs);
3298     return 0;
3299 }
3300 
3301 static void kvm_init_msrs(X86CPU *cpu)
3302 {
3303     CPUX86State *env = &cpu->env;
3304 
3305     kvm_msr_buf_reset(cpu);
3306     if (has_msr_arch_capabs) {
3307         kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
3308                           env->features[FEAT_ARCH_CAPABILITIES]);
3309     }
3310 
3311     if (has_msr_core_capabs) {
3312         kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
3313                           env->features[FEAT_CORE_CAPABILITY]);
3314     }
3315 
3316     if (has_msr_perf_capabs && cpu->enable_pmu) {
3317         kvm_msr_entry_add_perf(cpu, env->features);
3318     }
3319 
3320     if (has_msr_ucode_rev) {
3321         kvm_msr_entry_add(cpu, MSR_IA32_UCODE_REV, cpu->ucode_rev);
3322     }
3323 
3324     /*
3325      * Older kernels do not include VMX MSRs in KVM_GET_MSR_INDEX_LIST, but
3326      * all kernels with MSR features should have them.
3327      */
3328     if (kvm_feature_msrs && cpu_has_vmx(env)) {
3329         kvm_msr_entry_add_vmx(cpu, env->features);
3330     }
3331 
3332     assert(kvm_buf_set_msrs(cpu) == 0);
3333 }
3334 
3335 static int kvm_put_msrs(X86CPU *cpu, int level)
3336 {
3337     CPUX86State *env = &cpu->env;
3338     int i;
3339 
3340     kvm_msr_buf_reset(cpu);
3341 
3342     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs);
3343     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
3344     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
3345     kvm_msr_entry_add(cpu, MSR_PAT, env->pat);
3346     if (has_msr_star) {
3347         kvm_msr_entry_add(cpu, MSR_STAR, env->star);
3348     }
3349     if (has_msr_hsave_pa) {
3350         kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave);
3351     }
3352     if (has_msr_tsc_aux) {
3353         kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux);
3354     }
3355     if (has_msr_tsc_adjust) {
3356         kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust);
3357     }
3358     if (has_msr_misc_enable) {
3359         kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE,
3360                           env->msr_ia32_misc_enable);
3361     }
3362     if (has_msr_smbase) {
3363         kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase);
3364     }
3365     if (has_msr_smi_count) {
3366         kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count);
3367     }
3368     if (has_msr_pkrs) {
3369         kvm_msr_entry_add(cpu, MSR_IA32_PKRS, env->pkrs);
3370     }
3371     if (has_msr_bndcfgs) {
3372         kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs);
3373     }
3374     if (has_msr_xss) {
3375         kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss);
3376     }
3377     if (has_msr_umwait) {
3378         kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, env->umwait);
3379     }
3380     if (has_msr_spec_ctrl) {
3381         kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl);
3382     }
3383     if (has_tsc_scale_msr) {
3384         kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, env->amd_tsc_scale_msr);
3385     }
3386 
3387     if (has_msr_tsx_ctrl) {
3388         kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, env->tsx_ctrl);
3389     }
3390     if (has_msr_virt_ssbd) {
3391         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd);
3392     }
3393 
3394 #ifdef TARGET_X86_64
3395     if (lm_capable_kernel) {
3396         kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar);
3397         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
3398         kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
3399         kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
3400         if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) {
3401             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, env->fred_rsp0);
3402             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, env->fred_rsp1);
3403             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, env->fred_rsp2);
3404             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, env->fred_rsp3);
3405             kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, env->fred_stklvls);
3406             kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, env->fred_ssp1);
3407             kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, env->fred_ssp2);
3408             kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, env->fred_ssp3);
3409             kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, env->fred_config);
3410         }
3411     }
3412 #endif
3413 
3414     /*
3415      * The following MSRs have side effects on the guest or are too heavy
3416      * for normal writeback. Limit them to reset or full state updates.
3417      */
3418     if (level >= KVM_PUT_RESET_STATE) {
3419         kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
3420         kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
3421         kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
3422         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
3423             kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, env->async_pf_int_msr);
3424         }
3425         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
3426             kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
3427         }
3428         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
3429             kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr);
3430         }
3431         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
3432             kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr);
3433         }
3434 
3435         if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
3436             kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, env->poll_control_msr);
3437         }
3438 
3439         if (has_architectural_pmu_version > 0) {
3440             if (has_architectural_pmu_version > 1) {
3441                 /* Stop the counter.  */
3442                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
3443                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
3444             }
3445 
3446             /* Set the counter values.  */
3447             for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
3448                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i,
3449                                   env->msr_fixed_counters[i]);
3450             }
3451             for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
3452                 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i,
3453                                   env->msr_gp_counters[i]);
3454                 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i,
3455                                   env->msr_gp_evtsel[i]);
3456             }
3457             if (has_architectural_pmu_version > 1) {
3458                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS,
3459                                   env->msr_global_status);
3460                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
3461                                   env->msr_global_ovf_ctrl);
3462 
3463                 /* Now start the PMU.  */
3464                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL,
3465                                   env->msr_fixed_ctr_ctrl);
3466                 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL,
3467                                   env->msr_global_ctrl);
3468             }
3469         }
3470         /*
3471          * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add,
3472          * only sync them to KVM on the first cpu
3473          */
3474         if (current_cpu == first_cpu) {
3475             if (has_msr_hv_hypercall) {
3476                 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID,
3477                                   env->msr_hv_guest_os_id);
3478                 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL,
3479                                   env->msr_hv_hypercall);
3480             }
3481             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
3482                 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC,
3483                                   env->msr_hv_tsc);
3484             }
3485             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
3486                 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL,
3487                                   env->msr_hv_reenlightenment_control);
3488                 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL,
3489                                   env->msr_hv_tsc_emulation_control);
3490                 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS,
3491                                   env->msr_hv_tsc_emulation_status);
3492             }
3493 #ifdef CONFIG_SYNDBG
3494             if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNDBG) &&
3495                 has_msr_hv_syndbg_options) {
3496                 kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS,
3497                                   hyperv_syndbg_query_options());
3498             }
3499 #endif
3500         }
3501         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
3502             kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE,
3503                               env->msr_hv_vapic);
3504         }
3505         if (has_msr_hv_crash) {
3506             int j;
3507 
3508             for (j = 0; j < HV_CRASH_PARAMS; j++)
3509                 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j,
3510                                   env->msr_hv_crash_params[j]);
3511 
3512             kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY);
3513         }
3514         if (has_msr_hv_runtime) {
3515             kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime);
3516         }
3517         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)
3518             && hv_vpindex_settable) {
3519             kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX,
3520                               hyperv_vp_index(CPU(cpu)));
3521         }
3522         if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
3523             int j;
3524 
3525             kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION);
3526 
3527             kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL,
3528                               env->msr_hv_synic_control);
3529             kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP,
3530                               env->msr_hv_synic_evt_page);
3531             kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP,
3532                               env->msr_hv_synic_msg_page);
3533 
3534             for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) {
3535                 kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j,
3536                                   env->msr_hv_synic_sint[j]);
3537             }
3538         }
3539         if (has_msr_hv_stimer) {
3540             int j;
3541 
3542             for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) {
3543                 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2,
3544                                 env->msr_hv_stimer_config[j]);
3545             }
3546 
3547             for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) {
3548                 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2,
3549                                 env->msr_hv_stimer_count[j]);
3550             }
3551         }
3552         if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
3553             uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits);
3554 
3555             kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype);
3556             kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]);
3557             kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]);
3558             kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]);
3559             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]);
3560             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]);
3561             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]);
3562             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]);
3563             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]);
3564             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]);
3565             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]);
3566             kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]);
3567             for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
3568                 /* The CPU GPs if we write to a bit above the physical limit of
3569                  * the host CPU (and KVM emulates that)
3570                  */
3571                 uint64_t mask = env->mtrr_var[i].mask;
3572                 mask &= phys_mask;
3573 
3574                 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i),
3575                                   env->mtrr_var[i].base);
3576                 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask);
3577             }
3578         }
3579         if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
3580             int addr_num = kvm_arch_get_supported_cpuid(kvm_state,
3581                                                     0x14, 1, R_EAX) & 0x7;
3582 
3583             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL,
3584                             env->msr_rtit_ctrl);
3585             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS,
3586                             env->msr_rtit_status);
3587             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE,
3588                             env->msr_rtit_output_base);
3589             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK,
3590                             env->msr_rtit_output_mask);
3591             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH,
3592                             env->msr_rtit_cr3_match);
3593             for (i = 0; i < addr_num; i++) {
3594                 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i,
3595                             env->msr_rtit_addrs[i]);
3596             }
3597         }
3598 
3599         if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
3600             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0,
3601                               env->msr_ia32_sgxlepubkeyhash[0]);
3602             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1,
3603                               env->msr_ia32_sgxlepubkeyhash[1]);
3604             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2,
3605                               env->msr_ia32_sgxlepubkeyhash[2]);
3606             kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3,
3607                               env->msr_ia32_sgxlepubkeyhash[3]);
3608         }
3609 
3610         if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
3611             kvm_msr_entry_add(cpu, MSR_IA32_XFD,
3612                               env->msr_xfd);
3613             kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR,
3614                               env->msr_xfd_err);
3615         }
3616 
3617         if (kvm_enabled() && cpu->enable_pmu &&
3618             (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) {
3619             uint64_t depth;
3620             int ret;
3621 
3622             /*
3623              * Only migrate Arch LBR states when the host Arch LBR depth
3624              * equals that of source guest's, this is to avoid mismatch
3625              * of guest/host config for the msr hence avoid unexpected
3626              * misbehavior.
3627              */
3628             ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth);
3629 
3630             if (ret == 1 && !!depth && depth == env->msr_lbr_depth) {
3631                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, env->msr_lbr_ctl);
3632                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, env->msr_lbr_depth);
3633 
3634                 for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) {
3635                     if (!env->lbr_records[i].from) {
3636                         continue;
3637                     }
3638                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i,
3639                                       env->lbr_records[i].from);
3640                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i,
3641                                       env->lbr_records[i].to);
3642                     kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i,
3643                                       env->lbr_records[i].info);
3644                 }
3645             }
3646         }
3647 
3648         /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
3649          *       kvm_put_msr_feature_control. */
3650     }
3651 
3652     if (env->mcg_cap) {
3653         kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status);
3654         kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl);
3655         if (has_msr_mcg_ext_ctl) {
3656             kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl);
3657         }
3658         for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3659             kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]);
3660         }
3661     }
3662 
3663     return kvm_buf_set_msrs(cpu);
3664 }
3665 
3666 
3667 static int kvm_get_xsave(X86CPU *cpu)
3668 {
3669     CPUX86State *env = &cpu->env;
3670     void *xsave = env->xsave_buf;
3671     int type, ret;
3672 
3673     type = has_xsave2 ? KVM_GET_XSAVE2 : KVM_GET_XSAVE;
3674     ret = kvm_vcpu_ioctl(CPU(cpu), type, xsave);
3675     if (ret < 0) {
3676         return ret;
3677     }
3678     x86_cpu_xrstor_all_areas(cpu, xsave, env->xsave_buf_len);
3679 
3680     return 0;
3681 }
3682 
3683 static int kvm_get_xcrs(X86CPU *cpu)
3684 {
3685     CPUX86State *env = &cpu->env;
3686     int i, ret;
3687     struct kvm_xcrs xcrs;
3688 
3689     if (!has_xcrs) {
3690         return 0;
3691     }
3692 
3693     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
3694     if (ret < 0) {
3695         return ret;
3696     }
3697 
3698     for (i = 0; i < xcrs.nr_xcrs; i++) {
3699         /* Only support xcr0 now */
3700         if (xcrs.xcrs[i].xcr == 0) {
3701             env->xcr0 = xcrs.xcrs[i].value;
3702             break;
3703         }
3704     }
3705     return 0;
3706 }
3707 
3708 static int kvm_get_sregs(X86CPU *cpu)
3709 {
3710     CPUX86State *env = &cpu->env;
3711     struct kvm_sregs sregs;
3712     int ret;
3713 
3714     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
3715     if (ret < 0) {
3716         return ret;
3717     }
3718 
3719     /*
3720      * The interrupt_bitmap is ignored because KVM_GET_SREGS is
3721      * always preceded by KVM_GET_VCPU_EVENTS.
3722      */
3723 
3724     get_seg(&env->segs[R_CS], &sregs.cs);
3725     get_seg(&env->segs[R_DS], &sregs.ds);
3726     get_seg(&env->segs[R_ES], &sregs.es);
3727     get_seg(&env->segs[R_FS], &sregs.fs);
3728     get_seg(&env->segs[R_GS], &sregs.gs);
3729     get_seg(&env->segs[R_SS], &sregs.ss);
3730 
3731     get_seg(&env->tr, &sregs.tr);
3732     get_seg(&env->ldt, &sregs.ldt);
3733 
3734     env->idt.limit = sregs.idt.limit;
3735     env->idt.base = sregs.idt.base;
3736     env->gdt.limit = sregs.gdt.limit;
3737     env->gdt.base = sregs.gdt.base;
3738 
3739     env->cr[0] = sregs.cr0;
3740     env->cr[2] = sregs.cr2;
3741     env->cr[3] = sregs.cr3;
3742     env->cr[4] = sregs.cr4;
3743 
3744     env->efer = sregs.efer;
3745     if (sev_es_enabled() && env->efer & MSR_EFER_LME &&
3746         env->cr[0] & CR0_PG_MASK) {
3747         env->efer |= MSR_EFER_LMA;
3748     }
3749 
3750     /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
3751     x86_update_hflags(env);
3752 
3753     return 0;
3754 }
3755 
3756 static int kvm_get_sregs2(X86CPU *cpu)
3757 {
3758     CPUX86State *env = &cpu->env;
3759     struct kvm_sregs2 sregs;
3760     int i, ret;
3761 
3762     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS2, &sregs);
3763     if (ret < 0) {
3764         return ret;
3765     }
3766 
3767     get_seg(&env->segs[R_CS], &sregs.cs);
3768     get_seg(&env->segs[R_DS], &sregs.ds);
3769     get_seg(&env->segs[R_ES], &sregs.es);
3770     get_seg(&env->segs[R_FS], &sregs.fs);
3771     get_seg(&env->segs[R_GS], &sregs.gs);
3772     get_seg(&env->segs[R_SS], &sregs.ss);
3773 
3774     get_seg(&env->tr, &sregs.tr);
3775     get_seg(&env->ldt, &sregs.ldt);
3776 
3777     env->idt.limit = sregs.idt.limit;
3778     env->idt.base = sregs.idt.base;
3779     env->gdt.limit = sregs.gdt.limit;
3780     env->gdt.base = sregs.gdt.base;
3781 
3782     env->cr[0] = sregs.cr0;
3783     env->cr[2] = sregs.cr2;
3784     env->cr[3] = sregs.cr3;
3785     env->cr[4] = sregs.cr4;
3786 
3787     env->efer = sregs.efer;
3788     if (sev_es_enabled() && env->efer & MSR_EFER_LME &&
3789         env->cr[0] & CR0_PG_MASK) {
3790         env->efer |= MSR_EFER_LMA;
3791     }
3792 
3793     env->pdptrs_valid = sregs.flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
3794 
3795     if (env->pdptrs_valid) {
3796         for (i = 0; i < 4; i++) {
3797             env->pdptrs[i] = sregs.pdptrs[i];
3798         }
3799     }
3800 
3801     /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
3802     x86_update_hflags(env);
3803 
3804     return 0;
3805 }
3806 
3807 static int kvm_get_msrs(X86CPU *cpu)
3808 {
3809     CPUX86State *env = &cpu->env;
3810     struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
3811     int ret, i;
3812     uint64_t mtrr_top_bits;
3813 
3814     kvm_msr_buf_reset(cpu);
3815 
3816     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0);
3817     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0);
3818     kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0);
3819     kvm_msr_entry_add(cpu, MSR_PAT, 0);
3820     if (has_msr_star) {
3821         kvm_msr_entry_add(cpu, MSR_STAR, 0);
3822     }
3823     if (has_msr_hsave_pa) {
3824         kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0);
3825     }
3826     if (has_msr_tsc_aux) {
3827         kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0);
3828     }
3829     if (has_msr_tsc_adjust) {
3830         kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0);
3831     }
3832     if (has_msr_tsc_deadline) {
3833         kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0);
3834     }
3835     if (has_msr_misc_enable) {
3836         kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0);
3837     }
3838     if (has_msr_smbase) {
3839         kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0);
3840     }
3841     if (has_msr_smi_count) {
3842         kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0);
3843     }
3844     if (has_msr_feature_control) {
3845         kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0);
3846     }
3847     if (has_msr_pkrs) {
3848         kvm_msr_entry_add(cpu, MSR_IA32_PKRS, 0);
3849     }
3850     if (has_msr_bndcfgs) {
3851         kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0);
3852     }
3853     if (has_msr_xss) {
3854         kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0);
3855     }
3856     if (has_msr_umwait) {
3857         kvm_msr_entry_add(cpu, MSR_IA32_UMWAIT_CONTROL, 0);
3858     }
3859     if (has_msr_spec_ctrl) {
3860         kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0);
3861     }
3862     if (has_tsc_scale_msr) {
3863         kvm_msr_entry_add(cpu, MSR_AMD64_TSC_RATIO, 0);
3864     }
3865 
3866     if (has_msr_tsx_ctrl) {
3867         kvm_msr_entry_add(cpu, MSR_IA32_TSX_CTRL, 0);
3868     }
3869     if (has_msr_virt_ssbd) {
3870         kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0);
3871     }
3872     if (!env->tsc_valid) {
3873         kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0);
3874         env->tsc_valid = !runstate_is_running();
3875     }
3876 
3877 #ifdef TARGET_X86_64
3878     if (lm_capable_kernel) {
3879         kvm_msr_entry_add(cpu, MSR_CSTAR, 0);
3880         kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0);
3881         kvm_msr_entry_add(cpu, MSR_FMASK, 0);
3882         kvm_msr_entry_add(cpu, MSR_LSTAR, 0);
3883         if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_FRED) {
3884             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP0, 0);
3885             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP1, 0);
3886             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP2, 0);
3887             kvm_msr_entry_add(cpu, MSR_IA32_FRED_RSP3, 0);
3888             kvm_msr_entry_add(cpu, MSR_IA32_FRED_STKLVLS, 0);
3889             kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP1, 0);
3890             kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP2, 0);
3891             kvm_msr_entry_add(cpu, MSR_IA32_FRED_SSP3, 0);
3892             kvm_msr_entry_add(cpu, MSR_IA32_FRED_CONFIG, 0);
3893         }
3894     }
3895 #endif
3896     kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
3897     kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0);
3898     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF_INT)) {
3899         kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_INT, 0);
3900     }
3901     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
3902         kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0);
3903     }
3904     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
3905         kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0);
3906     }
3907     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
3908         kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
3909     }
3910     if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_POLL_CONTROL)) {
3911         kvm_msr_entry_add(cpu, MSR_KVM_POLL_CONTROL, 1);
3912     }
3913     if (has_architectural_pmu_version > 0) {
3914         if (has_architectural_pmu_version > 1) {
3915             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
3916             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
3917             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0);
3918             kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0);
3919         }
3920         for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
3921             kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0);
3922         }
3923         for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
3924             kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0);
3925             kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0);
3926         }
3927     }
3928 
3929     if (env->mcg_cap) {
3930         kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
3931         kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
3932         if (has_msr_mcg_ext_ctl) {
3933             kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0);
3934         }
3935         for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
3936             kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0);
3937         }
3938     }
3939 
3940     if (has_msr_hv_hypercall) {
3941         kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0);
3942         kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0);
3943     }
3944     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
3945         kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0);
3946     }
3947     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
3948         kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0);
3949     }
3950     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
3951         kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0);
3952         kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0);
3953         kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0);
3954     }
3955     if (has_msr_hv_syndbg_options) {
3956         kvm_msr_entry_add(cpu, HV_X64_MSR_SYNDBG_OPTIONS, 0);
3957     }
3958     if (has_msr_hv_crash) {
3959         int j;
3960 
3961         for (j = 0; j < HV_CRASH_PARAMS; j++) {
3962             kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0);
3963         }
3964     }
3965     if (has_msr_hv_runtime) {
3966         kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0);
3967     }
3968     if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
3969         uint32_t msr;
3970 
3971         kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0);
3972         kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0);
3973         kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0);
3974         for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) {
3975             kvm_msr_entry_add(cpu, msr, 0);
3976         }
3977     }
3978     if (has_msr_hv_stimer) {
3979         uint32_t msr;
3980 
3981         for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT;
3982              msr++) {
3983             kvm_msr_entry_add(cpu, msr, 0);
3984         }
3985     }
3986     if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
3987         kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0);
3988         kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0);
3989         kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0);
3990         kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0);
3991         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0);
3992         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0);
3993         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, 0);
3994         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0);
3995         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0);
3996         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0);
3997         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0);
3998         kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0);
3999         for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
4000             kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0);
4001             kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0);
4002         }
4003     }
4004 
4005     if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
4006         int addr_num =
4007             kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7;
4008 
4009         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0);
4010         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0);
4011         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0);
4012         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0);
4013         kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0);
4014         for (i = 0; i < addr_num; i++) {
4015             kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0);
4016         }
4017     }
4018 
4019     if (env->features[FEAT_7_0_ECX] & CPUID_7_0_ECX_SGX_LC) {
4020         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH0, 0);
4021         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH1, 0);
4022         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH2, 0);
4023         kvm_msr_entry_add(cpu, MSR_IA32_SGXLEPUBKEYHASH3, 0);
4024     }
4025 
4026     if (env->features[FEAT_XSAVE] & CPUID_D_1_EAX_XFD) {
4027         kvm_msr_entry_add(cpu, MSR_IA32_XFD, 0);
4028         kvm_msr_entry_add(cpu, MSR_IA32_XFD_ERR, 0);
4029     }
4030 
4031     if (kvm_enabled() && cpu->enable_pmu &&
4032         (env->features[FEAT_7_0_EDX] & CPUID_7_0_EDX_ARCH_LBR)) {
4033         uint64_t depth;
4034 
4035         ret = kvm_get_one_msr(cpu, MSR_ARCH_LBR_DEPTH, &depth);
4036         if (ret == 1 && depth == ARCH_LBR_NR_ENTRIES) {
4037             kvm_msr_entry_add(cpu, MSR_ARCH_LBR_CTL, 0);
4038             kvm_msr_entry_add(cpu, MSR_ARCH_LBR_DEPTH, 0);
4039 
4040             for (i = 0; i < ARCH_LBR_NR_ENTRIES; i++) {
4041                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_FROM_0 + i, 0);
4042                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_TO_0 + i, 0);
4043                 kvm_msr_entry_add(cpu, MSR_ARCH_LBR_INFO_0 + i, 0);
4044             }
4045         }
4046     }
4047 
4048     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
4049     if (ret < 0) {
4050         return ret;
4051     }
4052 
4053     if (ret < cpu->kvm_msr_buf->nmsrs) {
4054         struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
4055         error_report("error: failed to get MSR 0x%" PRIx32,
4056                      (uint32_t)e->index);
4057     }
4058 
4059     assert(ret == cpu->kvm_msr_buf->nmsrs);
4060     /*
4061      * MTRR masks: Each mask consists of 5 parts
4062      * a  10..0: must be zero
4063      * b  11   : valid bit
4064      * c n-1.12: actual mask bits
4065      * d  51..n: reserved must be zero
4066      * e  63.52: reserved must be zero
4067      *
4068      * 'n' is the number of physical bits supported by the CPU and is
4069      * apparently always <= 52.   We know our 'n' but don't know what
4070      * the destinations 'n' is; it might be smaller, in which case
4071      * it masks (c) on loading. It might be larger, in which case
4072      * we fill 'd' so that d..c is consistent irrespective of the 'n'
4073      * we're migrating to.
4074      */
4075 
4076     if (cpu->fill_mtrr_mask) {
4077         QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52);
4078         assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS);
4079         mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits);
4080     } else {
4081         mtrr_top_bits = 0;
4082     }
4083 
4084     for (i = 0; i < ret; i++) {
4085         uint32_t index = msrs[i].index;
4086         switch (index) {
4087         case MSR_IA32_SYSENTER_CS:
4088             env->sysenter_cs = msrs[i].data;
4089             break;
4090         case MSR_IA32_SYSENTER_ESP:
4091             env->sysenter_esp = msrs[i].data;
4092             break;
4093         case MSR_IA32_SYSENTER_EIP:
4094             env->sysenter_eip = msrs[i].data;
4095             break;
4096         case MSR_PAT:
4097             env->pat = msrs[i].data;
4098             break;
4099         case MSR_STAR:
4100             env->star = msrs[i].data;
4101             break;
4102 #ifdef TARGET_X86_64
4103         case MSR_CSTAR:
4104             env->cstar = msrs[i].data;
4105             break;
4106         case MSR_KERNELGSBASE:
4107             env->kernelgsbase = msrs[i].data;
4108             break;
4109         case MSR_FMASK:
4110             env->fmask = msrs[i].data;
4111             break;
4112         case MSR_LSTAR:
4113             env->lstar = msrs[i].data;
4114             break;
4115         case MSR_IA32_FRED_RSP0:
4116             env->fred_rsp0 = msrs[i].data;
4117             break;
4118         case MSR_IA32_FRED_RSP1:
4119             env->fred_rsp1 = msrs[i].data;
4120             break;
4121         case MSR_IA32_FRED_RSP2:
4122             env->fred_rsp2 = msrs[i].data;
4123             break;
4124         case MSR_IA32_FRED_RSP3:
4125             env->fred_rsp3 = msrs[i].data;
4126             break;
4127         case MSR_IA32_FRED_STKLVLS:
4128             env->fred_stklvls = msrs[i].data;
4129             break;
4130         case MSR_IA32_FRED_SSP1:
4131             env->fred_ssp1 = msrs[i].data;
4132             break;
4133         case MSR_IA32_FRED_SSP2:
4134             env->fred_ssp2 = msrs[i].data;
4135             break;
4136         case MSR_IA32_FRED_SSP3:
4137             env->fred_ssp3 = msrs[i].data;
4138             break;
4139         case MSR_IA32_FRED_CONFIG:
4140             env->fred_config = msrs[i].data;
4141             break;
4142 #endif
4143         case MSR_IA32_TSC:
4144             env->tsc = msrs[i].data;
4145             break;
4146         case MSR_TSC_AUX:
4147             env->tsc_aux = msrs[i].data;
4148             break;
4149         case MSR_TSC_ADJUST:
4150             env->tsc_adjust = msrs[i].data;
4151             break;
4152         case MSR_IA32_TSCDEADLINE:
4153             env->tsc_deadline = msrs[i].data;
4154             break;
4155         case MSR_VM_HSAVE_PA:
4156             env->vm_hsave = msrs[i].data;
4157             break;
4158         case MSR_KVM_SYSTEM_TIME:
4159             env->system_time_msr = msrs[i].data;
4160             break;
4161         case MSR_KVM_WALL_CLOCK:
4162             env->wall_clock_msr = msrs[i].data;
4163             break;
4164         case MSR_MCG_STATUS:
4165             env->mcg_status = msrs[i].data;
4166             break;
4167         case MSR_MCG_CTL:
4168             env->mcg_ctl = msrs[i].data;
4169             break;
4170         case MSR_MCG_EXT_CTL:
4171             env->mcg_ext_ctl = msrs[i].data;
4172             break;
4173         case MSR_IA32_MISC_ENABLE:
4174             env->msr_ia32_misc_enable = msrs[i].data;
4175             break;
4176         case MSR_IA32_SMBASE:
4177             env->smbase = msrs[i].data;
4178             break;
4179         case MSR_SMI_COUNT:
4180             env->msr_smi_count = msrs[i].data;
4181             break;
4182         case MSR_IA32_FEATURE_CONTROL:
4183             env->msr_ia32_feature_control = msrs[i].data;
4184             break;
4185         case MSR_IA32_BNDCFGS:
4186             env->msr_bndcfgs = msrs[i].data;
4187             break;
4188         case MSR_IA32_XSS:
4189             env->xss = msrs[i].data;
4190             break;
4191         case MSR_IA32_UMWAIT_CONTROL:
4192             env->umwait = msrs[i].data;
4193             break;
4194         case MSR_IA32_PKRS:
4195             env->pkrs = msrs[i].data;
4196             break;
4197         default:
4198             if (msrs[i].index >= MSR_MC0_CTL &&
4199                 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
4200                 env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
4201             }
4202             break;
4203         case MSR_KVM_ASYNC_PF_EN:
4204             env->async_pf_en_msr = msrs[i].data;
4205             break;
4206         case MSR_KVM_ASYNC_PF_INT:
4207             env->async_pf_int_msr = msrs[i].data;
4208             break;
4209         case MSR_KVM_PV_EOI_EN:
4210             env->pv_eoi_en_msr = msrs[i].data;
4211             break;
4212         case MSR_KVM_STEAL_TIME:
4213             env->steal_time_msr = msrs[i].data;
4214             break;
4215         case MSR_KVM_POLL_CONTROL: {
4216             env->poll_control_msr = msrs[i].data;
4217             break;
4218         }
4219         case MSR_CORE_PERF_FIXED_CTR_CTRL:
4220             env->msr_fixed_ctr_ctrl = msrs[i].data;
4221             break;
4222         case MSR_CORE_PERF_GLOBAL_CTRL:
4223             env->msr_global_ctrl = msrs[i].data;
4224             break;
4225         case MSR_CORE_PERF_GLOBAL_STATUS:
4226             env->msr_global_status = msrs[i].data;
4227             break;
4228         case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
4229             env->msr_global_ovf_ctrl = msrs[i].data;
4230             break;
4231         case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1:
4232             env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data;
4233             break;
4234         case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1:
4235             env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data;
4236             break;
4237         case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
4238             env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
4239             break;
4240         case HV_X64_MSR_HYPERCALL:
4241             env->msr_hv_hypercall = msrs[i].data;
4242             break;
4243         case HV_X64_MSR_GUEST_OS_ID:
4244             env->msr_hv_guest_os_id = msrs[i].data;
4245             break;
4246         case HV_X64_MSR_APIC_ASSIST_PAGE:
4247             env->msr_hv_vapic = msrs[i].data;
4248             break;
4249         case HV_X64_MSR_REFERENCE_TSC:
4250             env->msr_hv_tsc = msrs[i].data;
4251             break;
4252         case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
4253             env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data;
4254             break;
4255         case HV_X64_MSR_VP_RUNTIME:
4256             env->msr_hv_runtime = msrs[i].data;
4257             break;
4258         case HV_X64_MSR_SCONTROL:
4259             env->msr_hv_synic_control = msrs[i].data;
4260             break;
4261         case HV_X64_MSR_SIEFP:
4262             env->msr_hv_synic_evt_page = msrs[i].data;
4263             break;
4264         case HV_X64_MSR_SIMP:
4265             env->msr_hv_synic_msg_page = msrs[i].data;
4266             break;
4267         case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
4268             env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data;
4269             break;
4270         case HV_X64_MSR_STIMER0_CONFIG:
4271         case HV_X64_MSR_STIMER1_CONFIG:
4272         case HV_X64_MSR_STIMER2_CONFIG:
4273         case HV_X64_MSR_STIMER3_CONFIG:
4274             env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] =
4275                                 msrs[i].data;
4276             break;
4277         case HV_X64_MSR_STIMER0_COUNT:
4278         case HV_X64_MSR_STIMER1_COUNT:
4279         case HV_X64_MSR_STIMER2_COUNT:
4280         case HV_X64_MSR_STIMER3_COUNT:
4281             env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] =
4282                                 msrs[i].data;
4283             break;
4284         case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
4285             env->msr_hv_reenlightenment_control = msrs[i].data;
4286             break;
4287         case HV_X64_MSR_TSC_EMULATION_CONTROL:
4288             env->msr_hv_tsc_emulation_control = msrs[i].data;
4289             break;
4290         case HV_X64_MSR_TSC_EMULATION_STATUS:
4291             env->msr_hv_tsc_emulation_status = msrs[i].data;
4292             break;
4293         case HV_X64_MSR_SYNDBG_OPTIONS:
4294             env->msr_hv_syndbg_options = msrs[i].data;
4295             break;
4296         case MSR_MTRRdefType:
4297             env->mtrr_deftype = msrs[i].data;
4298             break;
4299         case MSR_MTRRfix64K_00000:
4300             env->mtrr_fixed[0] = msrs[i].data;
4301             break;
4302         case MSR_MTRRfix16K_80000:
4303             env->mtrr_fixed[1] = msrs[i].data;
4304             break;
4305         case MSR_MTRRfix16K_A0000:
4306             env->mtrr_fixed[2] = msrs[i].data;
4307             break;
4308         case MSR_MTRRfix4K_C0000:
4309             env->mtrr_fixed[3] = msrs[i].data;
4310             break;
4311         case MSR_MTRRfix4K_C8000:
4312             env->mtrr_fixed[4] = msrs[i].data;
4313             break;
4314         case MSR_MTRRfix4K_D0000:
4315             env->mtrr_fixed[5] = msrs[i].data;
4316             break;
4317         case MSR_MTRRfix4K_D8000:
4318             env->mtrr_fixed[6] = msrs[i].data;
4319             break;
4320         case MSR_MTRRfix4K_E0000:
4321             env->mtrr_fixed[7] = msrs[i].data;
4322             break;
4323         case MSR_MTRRfix4K_E8000:
4324             env->mtrr_fixed[8] = msrs[i].data;
4325             break;
4326         case MSR_MTRRfix4K_F0000:
4327             env->mtrr_fixed[9] = msrs[i].data;
4328             break;
4329         case MSR_MTRRfix4K_F8000:
4330             env->mtrr_fixed[10] = msrs[i].data;
4331             break;
4332         case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
4333             if (index & 1) {
4334                 env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
4335                                                                mtrr_top_bits;
4336             } else {
4337                 env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
4338             }
4339             break;
4340         case MSR_IA32_SPEC_CTRL:
4341             env->spec_ctrl = msrs[i].data;
4342             break;
4343         case MSR_AMD64_TSC_RATIO:
4344             env->amd_tsc_scale_msr = msrs[i].data;
4345             break;
4346         case MSR_IA32_TSX_CTRL:
4347             env->tsx_ctrl = msrs[i].data;
4348             break;
4349         case MSR_VIRT_SSBD:
4350             env->virt_ssbd = msrs[i].data;
4351             break;
4352         case MSR_IA32_RTIT_CTL:
4353             env->msr_rtit_ctrl = msrs[i].data;
4354             break;
4355         case MSR_IA32_RTIT_STATUS:
4356             env->msr_rtit_status = msrs[i].data;
4357             break;
4358         case MSR_IA32_RTIT_OUTPUT_BASE:
4359             env->msr_rtit_output_base = msrs[i].data;
4360             break;
4361         case MSR_IA32_RTIT_OUTPUT_MASK:
4362             env->msr_rtit_output_mask = msrs[i].data;
4363             break;
4364         case MSR_IA32_RTIT_CR3_MATCH:
4365             env->msr_rtit_cr3_match = msrs[i].data;
4366             break;
4367         case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
4368             env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data;
4369             break;
4370         case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
4371             env->msr_ia32_sgxlepubkeyhash[index - MSR_IA32_SGXLEPUBKEYHASH0] =
4372                            msrs[i].data;
4373             break;
4374         case MSR_IA32_XFD:
4375             env->msr_xfd = msrs[i].data;
4376             break;
4377         case MSR_IA32_XFD_ERR:
4378             env->msr_xfd_err = msrs[i].data;
4379             break;
4380         case MSR_ARCH_LBR_CTL:
4381             env->msr_lbr_ctl = msrs[i].data;
4382             break;
4383         case MSR_ARCH_LBR_DEPTH:
4384             env->msr_lbr_depth = msrs[i].data;
4385             break;
4386         case MSR_ARCH_LBR_FROM_0 ... MSR_ARCH_LBR_FROM_0 + 31:
4387             env->lbr_records[index - MSR_ARCH_LBR_FROM_0].from = msrs[i].data;
4388             break;
4389         case MSR_ARCH_LBR_TO_0 ... MSR_ARCH_LBR_TO_0 + 31:
4390             env->lbr_records[index - MSR_ARCH_LBR_TO_0].to = msrs[i].data;
4391             break;
4392         case MSR_ARCH_LBR_INFO_0 ... MSR_ARCH_LBR_INFO_0 + 31:
4393             env->lbr_records[index - MSR_ARCH_LBR_INFO_0].info = msrs[i].data;
4394             break;
4395         }
4396     }
4397 
4398     return 0;
4399 }
4400 
4401 static int kvm_put_mp_state(X86CPU *cpu)
4402 {
4403     struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };
4404 
4405     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
4406 }
4407 
4408 static int kvm_get_mp_state(X86CPU *cpu)
4409 {
4410     CPUState *cs = CPU(cpu);
4411     CPUX86State *env = &cpu->env;
4412     struct kvm_mp_state mp_state;
4413     int ret;
4414 
4415     ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state);
4416     if (ret < 0) {
4417         return ret;
4418     }
4419     env->mp_state = mp_state.mp_state;
4420     if (kvm_irqchip_in_kernel()) {
4421         cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
4422     }
4423     return 0;
4424 }
4425 
4426 static int kvm_get_apic(X86CPU *cpu)
4427 {
4428     DeviceState *apic = cpu->apic_state;
4429     struct kvm_lapic_state kapic;
4430     int ret;
4431 
4432     if (apic && kvm_irqchip_in_kernel()) {
4433         ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
4434         if (ret < 0) {
4435             return ret;
4436         }
4437 
4438         kvm_get_apic_state(apic, &kapic);
4439     }
4440     return 0;
4441 }
4442 
4443 static int kvm_put_vcpu_events(X86CPU *cpu, int level)
4444 {
4445     CPUState *cs = CPU(cpu);
4446     CPUX86State *env = &cpu->env;
4447     struct kvm_vcpu_events events = {};
4448 
4449     events.flags = 0;
4450 
4451     if (has_exception_payload) {
4452         events.flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
4453         events.exception.pending = env->exception_pending;
4454         events.exception_has_payload = env->exception_has_payload;
4455         events.exception_payload = env->exception_payload;
4456     }
4457     events.exception.nr = env->exception_nr;
4458     events.exception.injected = env->exception_injected;
4459     events.exception.has_error_code = env->has_error_code;
4460     events.exception.error_code = env->error_code;
4461 
4462     events.interrupt.injected = (env->interrupt_injected >= 0);
4463     events.interrupt.nr = env->interrupt_injected;
4464     events.interrupt.soft = env->soft_interrupt;
4465 
4466     events.nmi.injected = env->nmi_injected;
4467     events.nmi.pending = env->nmi_pending;
4468     events.nmi.masked = !!(env->hflags2 & HF2_NMI_MASK);
4469 
4470     events.sipi_vector = env->sipi_vector;
4471 
4472     if (has_msr_smbase) {
4473         events.flags |= KVM_VCPUEVENT_VALID_SMM;
4474         events.smi.smm = !!(env->hflags & HF_SMM_MASK);
4475         events.smi.smm_inside_nmi = !!(env->hflags2 & HF2_SMM_INSIDE_NMI_MASK);
4476         if (kvm_irqchip_in_kernel()) {
4477             /* As soon as these are moved to the kernel, remove them
4478              * from cs->interrupt_request.
4479              */
4480             events.smi.pending = cs->interrupt_request & CPU_INTERRUPT_SMI;
4481             events.smi.latched_init = cs->interrupt_request & CPU_INTERRUPT_INIT;
4482             cs->interrupt_request &= ~(CPU_INTERRUPT_INIT | CPU_INTERRUPT_SMI);
4483         } else {
4484             /* Keep these in cs->interrupt_request.  */
4485             events.smi.pending = 0;
4486             events.smi.latched_init = 0;
4487         }
4488     }
4489 
4490     if (level >= KVM_PUT_RESET_STATE) {
4491         events.flags |= KVM_VCPUEVENT_VALID_NMI_PENDING;
4492         if (env->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
4493             events.flags |= KVM_VCPUEVENT_VALID_SIPI_VECTOR;
4494         }
4495     }
4496 
4497     if (has_triple_fault_event) {
4498         events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
4499         events.triple_fault.pending = env->triple_fault_pending;
4500     }
4501 
4502     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events);
4503 }
4504 
4505 static int kvm_get_vcpu_events(X86CPU *cpu)
4506 {
4507     CPUX86State *env = &cpu->env;
4508     struct kvm_vcpu_events events;
4509     int ret;
4510 
4511     memset(&events, 0, sizeof(events));
4512     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_VCPU_EVENTS, &events);
4513     if (ret < 0) {
4514        return ret;
4515     }
4516 
4517     if (events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
4518         env->exception_pending = events.exception.pending;
4519         env->exception_has_payload = events.exception_has_payload;
4520         env->exception_payload = events.exception_payload;
4521     } else {
4522         env->exception_pending = 0;
4523         env->exception_has_payload = false;
4524     }
4525     env->exception_injected = events.exception.injected;
4526     env->exception_nr =
4527         (env->exception_pending || env->exception_injected) ?
4528         events.exception.nr : -1;
4529     env->has_error_code = events.exception.has_error_code;
4530     env->error_code = events.exception.error_code;
4531 
4532     env->interrupt_injected =
4533         events.interrupt.injected ? events.interrupt.nr : -1;
4534     env->soft_interrupt = events.interrupt.soft;
4535 
4536     env->nmi_injected = events.nmi.injected;
4537     env->nmi_pending = events.nmi.pending;
4538     if (events.nmi.masked) {
4539         env->hflags2 |= HF2_NMI_MASK;
4540     } else {
4541         env->hflags2 &= ~HF2_NMI_MASK;
4542     }
4543 
4544     if (events.flags & KVM_VCPUEVENT_VALID_SMM) {
4545         if (events.smi.smm) {
4546             env->hflags |= HF_SMM_MASK;
4547         } else {
4548             env->hflags &= ~HF_SMM_MASK;
4549         }
4550         if (events.smi.pending) {
4551             cpu_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
4552         } else {
4553             cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_SMI);
4554         }
4555         if (events.smi.smm_inside_nmi) {
4556             env->hflags2 |= HF2_SMM_INSIDE_NMI_MASK;
4557         } else {
4558             env->hflags2 &= ~HF2_SMM_INSIDE_NMI_MASK;
4559         }
4560         if (events.smi.latched_init) {
4561             cpu_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
4562         } else {
4563             cpu_reset_interrupt(CPU(cpu), CPU_INTERRUPT_INIT);
4564         }
4565     }
4566 
4567     if (events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
4568         env->triple_fault_pending = events.triple_fault.pending;
4569     }
4570 
4571     env->sipi_vector = events.sipi_vector;
4572 
4573     return 0;
4574 }
4575 
4576 static int kvm_put_debugregs(X86CPU *cpu)
4577 {
4578     CPUX86State *env = &cpu->env;
4579     struct kvm_debugregs dbgregs;
4580     int i;
4581 
4582     memset(&dbgregs, 0, sizeof(dbgregs));
4583     for (i = 0; i < 4; i++) {
4584         dbgregs.db[i] = env->dr[i];
4585     }
4586     dbgregs.dr6 = env->dr[6];
4587     dbgregs.dr7 = env->dr[7];
4588     dbgregs.flags = 0;
4589 
4590     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_DEBUGREGS, &dbgregs);
4591 }
4592 
4593 static int kvm_get_debugregs(X86CPU *cpu)
4594 {
4595     CPUX86State *env = &cpu->env;
4596     struct kvm_debugregs dbgregs;
4597     int i, ret;
4598 
4599     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_DEBUGREGS, &dbgregs);
4600     if (ret < 0) {
4601         return ret;
4602     }
4603     for (i = 0; i < 4; i++) {
4604         env->dr[i] = dbgregs.db[i];
4605     }
4606     env->dr[4] = env->dr[6] = dbgregs.dr6;
4607     env->dr[5] = env->dr[7] = dbgregs.dr7;
4608 
4609     return 0;
4610 }
4611 
4612 static int kvm_put_nested_state(X86CPU *cpu)
4613 {
4614     CPUX86State *env = &cpu->env;
4615     int max_nested_state_len = kvm_max_nested_state_length();
4616 
4617     if (!env->nested_state) {
4618         return 0;
4619     }
4620 
4621     /*
4622      * Copy flags that are affected by reset from env->hflags and env->hflags2.
4623      */
4624     if (env->hflags & HF_GUEST_MASK) {
4625         env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE;
4626     } else {
4627         env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE;
4628     }
4629 
4630     /* Don't set KVM_STATE_NESTED_GIF_SET on VMX as it is illegal */
4631     if (cpu_has_svm(env) && (env->hflags2 & HF2_GIF_MASK)) {
4632         env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET;
4633     } else {
4634         env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET;
4635     }
4636 
4637     assert(env->nested_state->size <= max_nested_state_len);
4638     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
4639 }
4640 
4641 static int kvm_get_nested_state(X86CPU *cpu)
4642 {
4643     CPUX86State *env = &cpu->env;
4644     int max_nested_state_len = kvm_max_nested_state_length();
4645     int ret;
4646 
4647     if (!env->nested_state) {
4648         return 0;
4649     }
4650 
4651     /*
4652      * It is possible that migration restored a smaller size into
4653      * nested_state->hdr.size than what our kernel support.
4654      * We preserve migration origin nested_state->hdr.size for
4655      * call to KVM_SET_NESTED_STATE but wish that our next call
4656      * to KVM_GET_NESTED_STATE will use max size our kernel support.
4657      */
4658     env->nested_state->size = max_nested_state_len;
4659 
4660     ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_NESTED_STATE, env->nested_state);
4661     if (ret < 0) {
4662         return ret;
4663     }
4664 
4665     /*
4666      * Copy flags that are affected by reset to env->hflags and env->hflags2.
4667      */
4668     if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) {
4669         env->hflags |= HF_GUEST_MASK;
4670     } else {
4671         env->hflags &= ~HF_GUEST_MASK;
4672     }
4673 
4674     /* Keep HF2_GIF_MASK set on !SVM as x86_cpu_pending_interrupt() needs it */
4675     if (cpu_has_svm(env)) {
4676         if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) {
4677             env->hflags2 |= HF2_GIF_MASK;
4678         } else {
4679             env->hflags2 &= ~HF2_GIF_MASK;
4680         }
4681     }
4682 
4683     return ret;
4684 }
4685 
4686 int kvm_arch_put_registers(CPUState *cpu, int level)
4687 {
4688     X86CPU *x86_cpu = X86_CPU(cpu);
4689     int ret;
4690 
4691     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
4692 
4693     /*
4694      * Put MSR_IA32_FEATURE_CONTROL first, this ensures the VM gets out of VMX
4695      * root operation upon vCPU reset. kvm_put_msr_feature_control() should also
4696      * precede kvm_put_nested_state() when 'real' nested state is set.
4697      */
4698     if (level >= KVM_PUT_RESET_STATE) {
4699         ret = kvm_put_msr_feature_control(x86_cpu);
4700         if (ret < 0) {
4701             return ret;
4702         }
4703     }
4704 
4705     /* must be before kvm_put_nested_state so that EFER.SVME is set */
4706     ret = has_sregs2 ? kvm_put_sregs2(x86_cpu) : kvm_put_sregs(x86_cpu);
4707     if (ret < 0) {
4708         return ret;
4709     }
4710 
4711     if (level >= KVM_PUT_RESET_STATE) {
4712         ret = kvm_put_nested_state(x86_cpu);
4713         if (ret < 0) {
4714             return ret;
4715         }
4716     }
4717 
4718     if (level == KVM_PUT_FULL_STATE) {
4719         /* We don't check for kvm_arch_set_tsc_khz() errors here,
4720          * because TSC frequency mismatch shouldn't abort migration,
4721          * unless the user explicitly asked for a more strict TSC
4722          * setting (e.g. using an explicit "tsc-freq" option).
4723          */
4724         kvm_arch_set_tsc_khz(cpu);
4725     }
4726 
4727 #ifdef CONFIG_XEN_EMU
4728     if (xen_mode == XEN_EMULATE && level == KVM_PUT_FULL_STATE) {
4729         ret = kvm_put_xen_state(cpu);
4730         if (ret < 0) {
4731             return ret;
4732         }
4733     }
4734 #endif
4735 
4736     ret = kvm_getput_regs(x86_cpu, 1);
4737     if (ret < 0) {
4738         return ret;
4739     }
4740     ret = kvm_put_xsave(x86_cpu);
4741     if (ret < 0) {
4742         return ret;
4743     }
4744     ret = kvm_put_xcrs(x86_cpu);
4745     if (ret < 0) {
4746         return ret;
4747     }
4748     ret = kvm_put_msrs(x86_cpu, level);
4749     if (ret < 0) {
4750         return ret;
4751     }
4752     ret = kvm_put_vcpu_events(x86_cpu, level);
4753     if (ret < 0) {
4754         return ret;
4755     }
4756     if (level >= KVM_PUT_RESET_STATE) {
4757         ret = kvm_put_mp_state(x86_cpu);
4758         if (ret < 0) {
4759             return ret;
4760         }
4761     }
4762 
4763     ret = kvm_put_tscdeadline_msr(x86_cpu);
4764     if (ret < 0) {
4765         return ret;
4766     }
4767     ret = kvm_put_debugregs(x86_cpu);
4768     if (ret < 0) {
4769         return ret;
4770     }
4771     return 0;
4772 }
4773 
4774 int kvm_arch_get_registers(CPUState *cs)
4775 {
4776     X86CPU *cpu = X86_CPU(cs);
4777     int ret;
4778 
4779     assert(cpu_is_stopped(cs) || qemu_cpu_is_self(cs));
4780 
4781     ret = kvm_get_vcpu_events(cpu);
4782     if (ret < 0) {
4783         goto out;
4784     }
4785     /*
4786      * KVM_GET_MPSTATE can modify CS and RIP, call it before
4787      * KVM_GET_REGS and KVM_GET_SREGS.
4788      */
4789     ret = kvm_get_mp_state(cpu);
4790     if (ret < 0) {
4791         goto out;
4792     }
4793     ret = kvm_getput_regs(cpu, 0);
4794     if (ret < 0) {
4795         goto out;
4796     }
4797     ret = kvm_get_xsave(cpu);
4798     if (ret < 0) {
4799         goto out;
4800     }
4801     ret = kvm_get_xcrs(cpu);
4802     if (ret < 0) {
4803         goto out;
4804     }
4805     ret = has_sregs2 ? kvm_get_sregs2(cpu) : kvm_get_sregs(cpu);
4806     if (ret < 0) {
4807         goto out;
4808     }
4809     ret = kvm_get_msrs(cpu);
4810     if (ret < 0) {
4811         goto out;
4812     }
4813     ret = kvm_get_apic(cpu);
4814     if (ret < 0) {
4815         goto out;
4816     }
4817     ret = kvm_get_debugregs(cpu);
4818     if (ret < 0) {
4819         goto out;
4820     }
4821     ret = kvm_get_nested_state(cpu);
4822     if (ret < 0) {
4823         goto out;
4824     }
4825 #ifdef CONFIG_XEN_EMU
4826     if (xen_mode == XEN_EMULATE) {
4827         ret = kvm_get_xen_state(cs);
4828         if (ret < 0) {
4829             goto out;
4830         }
4831     }
4832 #endif
4833     ret = 0;
4834  out:
4835     cpu_sync_bndcs_hflags(&cpu->env);
4836     return ret;
4837 }
4838 
4839 void kvm_arch_pre_run(CPUState *cpu, struct kvm_run *run)
4840 {
4841     X86CPU *x86_cpu = X86_CPU(cpu);
4842     CPUX86State *env = &x86_cpu->env;
4843     int ret;
4844 
4845     /* Inject NMI */
4846     if (cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
4847         if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
4848             bql_lock();
4849             cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
4850             bql_unlock();
4851             DPRINTF("injected NMI\n");
4852             ret = kvm_vcpu_ioctl(cpu, KVM_NMI);
4853             if (ret < 0) {
4854                 fprintf(stderr, "KVM: injection failed, NMI lost (%s)\n",
4855                         strerror(-ret));
4856             }
4857         }
4858         if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
4859             bql_lock();
4860             cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
4861             bql_unlock();
4862             DPRINTF("injected SMI\n");
4863             ret = kvm_vcpu_ioctl(cpu, KVM_SMI);
4864             if (ret < 0) {
4865                 fprintf(stderr, "KVM: injection failed, SMI lost (%s)\n",
4866                         strerror(-ret));
4867             }
4868         }
4869     }
4870 
4871     if (!kvm_pic_in_kernel()) {
4872         bql_lock();
4873     }
4874 
4875     /* Force the VCPU out of its inner loop to process any INIT requests
4876      * or (for userspace APIC, but it is cheap to combine the checks here)
4877      * pending TPR access reports.
4878      */
4879     if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
4880         if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
4881             !(env->hflags & HF_SMM_MASK)) {
4882             cpu->exit_request = 1;
4883         }
4884         if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
4885             cpu->exit_request = 1;
4886         }
4887     }
4888 
4889     if (!kvm_pic_in_kernel()) {
4890         /* Try to inject an interrupt if the guest can accept it */
4891         if (run->ready_for_interrupt_injection &&
4892             (cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
4893             (env->eflags & IF_MASK)) {
4894             int irq;
4895 
4896             cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
4897             irq = cpu_get_pic_interrupt(env);
4898             if (irq >= 0) {
4899                 struct kvm_interrupt intr;
4900 
4901                 intr.irq = irq;
4902                 DPRINTF("injected interrupt %d\n", irq);
4903                 ret = kvm_vcpu_ioctl(cpu, KVM_INTERRUPT, &intr);
4904                 if (ret < 0) {
4905                     fprintf(stderr,
4906                             "KVM: injection failed, interrupt lost (%s)\n",
4907                             strerror(-ret));
4908                 }
4909             }
4910         }
4911 
4912         /* If we have an interrupt but the guest is not ready to receive an
4913          * interrupt, request an interrupt window exit.  This will
4914          * cause a return to userspace as soon as the guest is ready to
4915          * receive interrupts. */
4916         if ((cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
4917             run->request_interrupt_window = 1;
4918         } else {
4919             run->request_interrupt_window = 0;
4920         }
4921 
4922         DPRINTF("setting tpr\n");
4923         run->cr8 = cpu_get_apic_tpr(x86_cpu->apic_state);
4924 
4925         bql_unlock();
4926     }
4927 }
4928 
4929 static void kvm_rate_limit_on_bus_lock(void)
4930 {
4931     uint64_t delay_ns = ratelimit_calculate_delay(&bus_lock_ratelimit_ctrl, 1);
4932 
4933     if (delay_ns) {
4934         g_usleep(delay_ns / SCALE_US);
4935     }
4936 }
4937 
4938 MemTxAttrs kvm_arch_post_run(CPUState *cpu, struct kvm_run *run)
4939 {
4940     X86CPU *x86_cpu = X86_CPU(cpu);
4941     CPUX86State *env = &x86_cpu->env;
4942 
4943     if (run->flags & KVM_RUN_X86_SMM) {
4944         env->hflags |= HF_SMM_MASK;
4945     } else {
4946         env->hflags &= ~HF_SMM_MASK;
4947     }
4948     if (run->if_flag) {
4949         env->eflags |= IF_MASK;
4950     } else {
4951         env->eflags &= ~IF_MASK;
4952     }
4953     if (run->flags & KVM_RUN_X86_BUS_LOCK) {
4954         kvm_rate_limit_on_bus_lock();
4955     }
4956 
4957 #ifdef CONFIG_XEN_EMU
4958     /*
4959      * If the callback is asserted as a GSI (or PCI INTx) then check if
4960      * vcpu_info->evtchn_upcall_pending has been cleared, and deassert
4961      * the callback IRQ if so. Ideally we could hook into the PIC/IOAPIC
4962      * EOI and only resample then, exactly how the VFIO eventfd pairs
4963      * are designed to work for level triggered interrupts.
4964      */
4965     if (x86_cpu->env.xen_callback_asserted) {
4966         kvm_xen_maybe_deassert_callback(cpu);
4967     }
4968 #endif
4969 
4970     /* We need to protect the apic state against concurrent accesses from
4971      * different threads in case the userspace irqchip is used. */
4972     if (!kvm_irqchip_in_kernel()) {
4973         bql_lock();
4974     }
4975     cpu_set_apic_tpr(x86_cpu->apic_state, run->cr8);
4976     cpu_set_apic_base(x86_cpu->apic_state, run->apic_base);
4977     if (!kvm_irqchip_in_kernel()) {
4978         bql_unlock();
4979     }
4980     return cpu_get_mem_attrs(env);
4981 }
4982 
4983 int kvm_arch_process_async_events(CPUState *cs)
4984 {
4985     X86CPU *cpu = X86_CPU(cs);
4986     CPUX86State *env = &cpu->env;
4987 
4988     if (cs->interrupt_request & CPU_INTERRUPT_MCE) {
4989         /* We must not raise CPU_INTERRUPT_MCE if it's not supported. */
4990         assert(env->mcg_cap);
4991 
4992         cs->interrupt_request &= ~CPU_INTERRUPT_MCE;
4993 
4994         kvm_cpu_synchronize_state(cs);
4995 
4996         if (env->exception_nr == EXCP08_DBLE) {
4997             /* this means triple fault */
4998             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
4999             cs->exit_request = 1;
5000             return 0;
5001         }
5002         kvm_queue_exception(env, EXCP12_MCHK, 0, 0);
5003         env->has_error_code = 0;
5004 
5005         cs->halted = 0;
5006         if (kvm_irqchip_in_kernel() && env->mp_state == KVM_MP_STATE_HALTED) {
5007             env->mp_state = KVM_MP_STATE_RUNNABLE;
5008         }
5009     }
5010 
5011     if ((cs->interrupt_request & CPU_INTERRUPT_INIT) &&
5012         !(env->hflags & HF_SMM_MASK)) {
5013         kvm_cpu_synchronize_state(cs);
5014         do_cpu_init(cpu);
5015     }
5016 
5017     if (kvm_irqchip_in_kernel()) {
5018         return 0;
5019     }
5020 
5021     if (cs->interrupt_request & CPU_INTERRUPT_POLL) {
5022         cs->interrupt_request &= ~CPU_INTERRUPT_POLL;
5023         apic_poll_irq(cpu->apic_state);
5024     }
5025     if (((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
5026          (env->eflags & IF_MASK)) ||
5027         (cs->interrupt_request & CPU_INTERRUPT_NMI)) {
5028         cs->halted = 0;
5029     }
5030     if (cs->interrupt_request & CPU_INTERRUPT_SIPI) {
5031         kvm_cpu_synchronize_state(cs);
5032         do_cpu_sipi(cpu);
5033     }
5034     if (cs->interrupt_request & CPU_INTERRUPT_TPR) {
5035         cs->interrupt_request &= ~CPU_INTERRUPT_TPR;
5036         kvm_cpu_synchronize_state(cs);
5037         apic_handle_tpr_access_report(cpu->apic_state, env->eip,
5038                                       env->tpr_access_type);
5039     }
5040 
5041     return cs->halted;
5042 }
5043 
5044 static int kvm_handle_halt(X86CPU *cpu)
5045 {
5046     CPUState *cs = CPU(cpu);
5047     CPUX86State *env = &cpu->env;
5048 
5049     if (!((cs->interrupt_request & CPU_INTERRUPT_HARD) &&
5050           (env->eflags & IF_MASK)) &&
5051         !(cs->interrupt_request & CPU_INTERRUPT_NMI)) {
5052         cs->halted = 1;
5053         return EXCP_HLT;
5054     }
5055 
5056     return 0;
5057 }
5058 
5059 static int kvm_handle_tpr_access(X86CPU *cpu)
5060 {
5061     CPUState *cs = CPU(cpu);
5062     struct kvm_run *run = cs->kvm_run;
5063 
5064     apic_handle_tpr_access_report(cpu->apic_state, run->tpr_access.rip,
5065                                   run->tpr_access.is_write ? TPR_ACCESS_WRITE
5066                                                            : TPR_ACCESS_READ);
5067     return 1;
5068 }
5069 
5070 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
5071 {
5072     static const uint8_t int3 = 0xcc;
5073 
5074     if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 0) ||
5075         cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&int3, 1, 1)) {
5076         return -EINVAL;
5077     }
5078     return 0;
5079 }
5080 
5081 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
5082 {
5083     uint8_t int3;
5084 
5085     if (cpu_memory_rw_debug(cs, bp->pc, &int3, 1, 0)) {
5086         return -EINVAL;
5087     }
5088     if (int3 != 0xcc) {
5089         return 0;
5090     }
5091     if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 1, 1)) {
5092         return -EINVAL;
5093     }
5094     return 0;
5095 }
5096 
5097 static struct {
5098     target_ulong addr;
5099     int len;
5100     int type;
5101 } hw_breakpoint[4];
5102 
5103 static int nb_hw_breakpoint;
5104 
5105 static int find_hw_breakpoint(target_ulong addr, int len, int type)
5106 {
5107     int n;
5108 
5109     for (n = 0; n < nb_hw_breakpoint; n++) {
5110         if (hw_breakpoint[n].addr == addr && hw_breakpoint[n].type == type &&
5111             (hw_breakpoint[n].len == len || len == -1)) {
5112             return n;
5113         }
5114     }
5115     return -1;
5116 }
5117 
5118 int kvm_arch_insert_hw_breakpoint(vaddr addr, vaddr len, int type)
5119 {
5120     switch (type) {
5121     case GDB_BREAKPOINT_HW:
5122         len = 1;
5123         break;
5124     case GDB_WATCHPOINT_WRITE:
5125     case GDB_WATCHPOINT_ACCESS:
5126         switch (len) {
5127         case 1:
5128             break;
5129         case 2:
5130         case 4:
5131         case 8:
5132             if (addr & (len - 1)) {
5133                 return -EINVAL;
5134             }
5135             break;
5136         default:
5137             return -EINVAL;
5138         }
5139         break;
5140     default:
5141         return -ENOSYS;
5142     }
5143 
5144     if (nb_hw_breakpoint == 4) {
5145         return -ENOBUFS;
5146     }
5147     if (find_hw_breakpoint(addr, len, type) >= 0) {
5148         return -EEXIST;
5149     }
5150     hw_breakpoint[nb_hw_breakpoint].addr = addr;
5151     hw_breakpoint[nb_hw_breakpoint].len = len;
5152     hw_breakpoint[nb_hw_breakpoint].type = type;
5153     nb_hw_breakpoint++;
5154 
5155     return 0;
5156 }
5157 
5158 int kvm_arch_remove_hw_breakpoint(vaddr addr, vaddr len, int type)
5159 {
5160     int n;
5161 
5162     n = find_hw_breakpoint(addr, (type == GDB_BREAKPOINT_HW) ? 1 : len, type);
5163     if (n < 0) {
5164         return -ENOENT;
5165     }
5166     nb_hw_breakpoint--;
5167     hw_breakpoint[n] = hw_breakpoint[nb_hw_breakpoint];
5168 
5169     return 0;
5170 }
5171 
5172 void kvm_arch_remove_all_hw_breakpoints(void)
5173 {
5174     nb_hw_breakpoint = 0;
5175 }
5176 
5177 static CPUWatchpoint hw_watchpoint;
5178 
5179 static int kvm_handle_debug(X86CPU *cpu,
5180                             struct kvm_debug_exit_arch *arch_info)
5181 {
5182     CPUState *cs = CPU(cpu);
5183     CPUX86State *env = &cpu->env;
5184     int ret = 0;
5185     int n;
5186 
5187     if (arch_info->exception == EXCP01_DB) {
5188         if (arch_info->dr6 & DR6_BS) {
5189             if (cs->singlestep_enabled) {
5190                 ret = EXCP_DEBUG;
5191             }
5192         } else {
5193             for (n = 0; n < 4; n++) {
5194                 if (arch_info->dr6 & (1 << n)) {
5195                     switch ((arch_info->dr7 >> (16 + n*4)) & 0x3) {
5196                     case 0x0:
5197                         ret = EXCP_DEBUG;
5198                         break;
5199                     case 0x1:
5200                         ret = EXCP_DEBUG;
5201                         cs->watchpoint_hit = &hw_watchpoint;
5202                         hw_watchpoint.vaddr = hw_breakpoint[n].addr;
5203                         hw_watchpoint.flags = BP_MEM_WRITE;
5204                         break;
5205                     case 0x3:
5206                         ret = EXCP_DEBUG;
5207                         cs->watchpoint_hit = &hw_watchpoint;
5208                         hw_watchpoint.vaddr = hw_breakpoint[n].addr;
5209                         hw_watchpoint.flags = BP_MEM_ACCESS;
5210                         break;
5211                     }
5212                 }
5213             }
5214         }
5215     } else if (kvm_find_sw_breakpoint(cs, arch_info->pc)) {
5216         ret = EXCP_DEBUG;
5217     }
5218     if (ret == 0) {
5219         cpu_synchronize_state(cs);
5220         assert(env->exception_nr == -1);
5221 
5222         /* pass to guest */
5223         kvm_queue_exception(env, arch_info->exception,
5224                             arch_info->exception == EXCP01_DB,
5225                             arch_info->dr6);
5226         env->has_error_code = 0;
5227     }
5228 
5229     return ret;
5230 }
5231 
5232 void kvm_arch_update_guest_debug(CPUState *cpu, struct kvm_guest_debug *dbg)
5233 {
5234     const uint8_t type_code[] = {
5235         [GDB_BREAKPOINT_HW] = 0x0,
5236         [GDB_WATCHPOINT_WRITE] = 0x1,
5237         [GDB_WATCHPOINT_ACCESS] = 0x3
5238     };
5239     const uint8_t len_code[] = {
5240         [1] = 0x0, [2] = 0x1, [4] = 0x3, [8] = 0x2
5241     };
5242     int n;
5243 
5244     if (kvm_sw_breakpoints_active(cpu)) {
5245         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
5246     }
5247     if (nb_hw_breakpoint > 0) {
5248         dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
5249         dbg->arch.debugreg[7] = 0x0600;
5250         for (n = 0; n < nb_hw_breakpoint; n++) {
5251             dbg->arch.debugreg[n] = hw_breakpoint[n].addr;
5252             dbg->arch.debugreg[7] |= (2 << (n * 2)) |
5253                 (type_code[hw_breakpoint[n].type] << (16 + n*4)) |
5254                 ((uint32_t)len_code[hw_breakpoint[n].len] << (18 + n*4));
5255         }
5256     }
5257 }
5258 
5259 static bool kvm_install_msr_filters(KVMState *s)
5260 {
5261     uint64_t zero = 0;
5262     struct kvm_msr_filter filter = {
5263         .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
5264     };
5265     int r, i, j = 0;
5266 
5267     for (i = 0; i < KVM_MSR_FILTER_MAX_RANGES; i++) {
5268         KVMMSRHandlers *handler = &msr_handlers[i];
5269         if (handler->msr) {
5270             struct kvm_msr_filter_range *range = &filter.ranges[j++];
5271 
5272             *range = (struct kvm_msr_filter_range) {
5273                 .flags = 0,
5274                 .nmsrs = 1,
5275                 .base = handler->msr,
5276                 .bitmap = (__u8 *)&zero,
5277             };
5278 
5279             if (handler->rdmsr) {
5280                 range->flags |= KVM_MSR_FILTER_READ;
5281             }
5282 
5283             if (handler->wrmsr) {
5284                 range->flags |= KVM_MSR_FILTER_WRITE;
5285             }
5286         }
5287     }
5288 
5289     r = kvm_vm_ioctl(s, KVM_X86_SET_MSR_FILTER, &filter);
5290     if (r) {
5291         return false;
5292     }
5293 
5294     return true;
5295 }
5296 
5297 bool kvm_filter_msr(KVMState *s, uint32_t msr, QEMURDMSRHandler *rdmsr,
5298                     QEMUWRMSRHandler *wrmsr)
5299 {
5300     int i;
5301 
5302     for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
5303         if (!msr_handlers[i].msr) {
5304             msr_handlers[i] = (KVMMSRHandlers) {
5305                 .msr = msr,
5306                 .rdmsr = rdmsr,
5307                 .wrmsr = wrmsr,
5308             };
5309 
5310             if (!kvm_install_msr_filters(s)) {
5311                 msr_handlers[i] = (KVMMSRHandlers) { };
5312                 return false;
5313             }
5314 
5315             return true;
5316         }
5317     }
5318 
5319     return false;
5320 }
5321 
5322 static int kvm_handle_rdmsr(X86CPU *cpu, struct kvm_run *run)
5323 {
5324     int i;
5325     bool r;
5326 
5327     for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
5328         KVMMSRHandlers *handler = &msr_handlers[i];
5329         if (run->msr.index == handler->msr) {
5330             if (handler->rdmsr) {
5331                 r = handler->rdmsr(cpu, handler->msr,
5332                                    (uint64_t *)&run->msr.data);
5333                 run->msr.error = r ? 0 : 1;
5334                 return 0;
5335             }
5336         }
5337     }
5338 
5339     assert(false);
5340 }
5341 
5342 static int kvm_handle_wrmsr(X86CPU *cpu, struct kvm_run *run)
5343 {
5344     int i;
5345     bool r;
5346 
5347     for (i = 0; i < ARRAY_SIZE(msr_handlers); i++) {
5348         KVMMSRHandlers *handler = &msr_handlers[i];
5349         if (run->msr.index == handler->msr) {
5350             if (handler->wrmsr) {
5351                 r = handler->wrmsr(cpu, handler->msr, run->msr.data);
5352                 run->msr.error = r ? 0 : 1;
5353                 return 0;
5354             }
5355         }
5356     }
5357 
5358     assert(false);
5359 }
5360 
5361 static bool has_sgx_provisioning;
5362 
5363 static bool __kvm_enable_sgx_provisioning(KVMState *s)
5364 {
5365     int fd, ret;
5366 
5367     if (!kvm_vm_check_extension(s, KVM_CAP_SGX_ATTRIBUTE)) {
5368         return false;
5369     }
5370 
5371     fd = qemu_open_old("/dev/sgx_provision", O_RDONLY);
5372     if (fd < 0) {
5373         return false;
5374     }
5375 
5376     ret = kvm_vm_enable_cap(s, KVM_CAP_SGX_ATTRIBUTE, 0, fd);
5377     if (ret) {
5378         error_report("Could not enable SGX PROVISIONKEY: %s", strerror(-ret));
5379         exit(1);
5380     }
5381     close(fd);
5382     return true;
5383 }
5384 
/*
 * Public entry point for enabling SGX provisioning on @s.
 *
 * Wraps __kvm_enable_sgx_provisioning() in MEMORIZE() together with the
 * has_sgx_provisioning variable.
 * NOTE(review): MEMORIZE is defined outside this view; presumably it
 * evaluates the expression once and caches the result -- confirm.
 */
bool kvm_enable_sgx_provisioning(KVMState *s)
{
    return MEMORIZE(__kvm_enable_sgx_provisioning(s), has_sgx_provisioning);
}
5389 
5390 static bool host_supports_vmx(void)
5391 {
5392     uint32_t ecx, unused;
5393 
5394     host_cpuid(1, 0, &unused, &unused, &ecx, &unused);
5395     return ecx & CPUID_EXT_VMX;
5396 }
5397 
5398 /*
5399  * Currently the handling here only supports use of KVM_HC_MAP_GPA_RANGE
5400  * to service guest-initiated memory attribute update requests so that
5401  * KVM_SET_MEMORY_ATTRIBUTES can update whether or not a page should be
5402  * backed by the private memory pool provided by guest_memfd, and as such
5403  * is only applicable to guest_memfd-backed guests (e.g. SNP/TDX).
5404  *
 * Other use-cases for KVM_HC_MAP_GPA_RANGE, such as for SEV live
5406  * migration, are not implemented here currently.
5407  *
5408  * For the guest_memfd use-case, these exits will generally be synthesized
5409  * by KVM based on platform-specific hypercalls, like GHCB requests in the
 * case of SEV-SNP, and not issued directly within the guest through the
5411  * KVM_HC_MAP_GPA_RANGE hypercall. So in this case, KVM_HC_MAP_GPA_RANGE is
5412  * not actually advertised to guests via the KVM CPUID feature bit, as
5413  * opposed to SEV live migration where it would be. Since it is unlikely the
5414  * SEV live migration use-case would be useful for guest-memfd backed guests,
5415  * because private/shared page tracking is already provided through other
5416  * means, these 2 use-cases should be treated as being mutually-exclusive.
5417  */
5418 static int kvm_handle_hc_map_gpa_range(struct kvm_run *run)
5419 {
5420     uint64_t gpa, size, attributes;
5421 
5422     if (!machine_require_guest_memfd(current_machine))
5423         return -EINVAL;
5424 
5425     gpa = run->hypercall.args[0];
5426     size = run->hypercall.args[1] * TARGET_PAGE_SIZE;
5427     attributes = run->hypercall.args[2];
5428 
5429     trace_kvm_hc_map_gpa_range(gpa, size, attributes, run->hypercall.flags);
5430 
5431     return kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED);
5432 }
5433 
5434 static int kvm_handle_hypercall(struct kvm_run *run)
5435 {
5436     if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE)
5437         return kvm_handle_hc_map_gpa_range(run);
5438 
5439     return -EINVAL;
5440 }
5441 
5442 #define VMX_INVALID_GUEST_STATE 0x80000021
5443 
5444 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
5445 {
5446     X86CPU *cpu = X86_CPU(cs);
5447     uint64_t code;
5448     int ret;
5449     bool ctx_invalid;
5450     KVMState *state;
5451 
5452     switch (run->exit_reason) {
5453     case KVM_EXIT_HLT:
5454         DPRINTF("handle_hlt\n");
5455         bql_lock();
5456         ret = kvm_handle_halt(cpu);
5457         bql_unlock();
5458         break;
5459     case KVM_EXIT_SET_TPR:
5460         ret = 0;
5461         break;
5462     case KVM_EXIT_TPR_ACCESS:
5463         bql_lock();
5464         ret = kvm_handle_tpr_access(cpu);
5465         bql_unlock();
5466         break;
5467     case KVM_EXIT_FAIL_ENTRY:
5468         code = run->fail_entry.hardware_entry_failure_reason;
5469         fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
5470                 code);
5471         if (host_supports_vmx() && code == VMX_INVALID_GUEST_STATE) {
5472             fprintf(stderr,
5473                     "\nIf you're running a guest on an Intel machine without "
5474                         "unrestricted mode\n"
5475                     "support, the failure can be most likely due to the guest "
5476                         "entering an invalid\n"
5477                     "state for Intel VT. For example, the guest maybe running "
5478                         "in big real mode\n"
5479                     "which is not supported on less recent Intel processors."
5480                         "\n\n");
5481         }
5482         ret = -1;
5483         break;
5484     case KVM_EXIT_EXCEPTION:
5485         fprintf(stderr, "KVM: exception %d exit (error code 0x%x)\n",
5486                 run->ex.exception, run->ex.error_code);
5487         ret = -1;
5488         break;
5489     case KVM_EXIT_DEBUG:
5490         DPRINTF("kvm_exit_debug\n");
5491         bql_lock();
5492         ret = kvm_handle_debug(cpu, &run->debug.arch);
5493         bql_unlock();
5494         break;
5495     case KVM_EXIT_HYPERV:
5496         ret = kvm_hv_handle_exit(cpu, &run->hyperv);
5497         break;
5498     case KVM_EXIT_IOAPIC_EOI:
5499         ioapic_eoi_broadcast(run->eoi.vector);
5500         ret = 0;
5501         break;
5502     case KVM_EXIT_X86_BUS_LOCK:
5503         /* already handled in kvm_arch_post_run */
5504         ret = 0;
5505         break;
5506     case KVM_EXIT_NOTIFY:
5507         ctx_invalid = !!(run->notify.flags & KVM_NOTIFY_CONTEXT_INVALID);
5508         state = KVM_STATE(current_accel());
5509         if (ctx_invalid ||
5510             state->notify_vmexit == NOTIFY_VMEXIT_OPTION_INTERNAL_ERROR) {
5511             warn_report("KVM internal error: Encountered a notify exit "
5512                         "with invalid context in guest.");
5513             ret = -1;
5514         } else {
5515             warn_report_once("KVM: Encountered a notify exit with valid "
5516                              "context in guest. "
5517                              "The guest could be misbehaving.");
5518             ret = 0;
5519         }
5520         break;
5521     case KVM_EXIT_X86_RDMSR:
5522         /* We only enable MSR filtering, any other exit is bogus */
5523         assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER);
5524         ret = kvm_handle_rdmsr(cpu, run);
5525         break;
5526     case KVM_EXIT_X86_WRMSR:
5527         /* We only enable MSR filtering, any other exit is bogus */
5528         assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER);
5529         ret = kvm_handle_wrmsr(cpu, run);
5530         break;
5531 #ifdef CONFIG_XEN_EMU
5532     case KVM_EXIT_XEN:
5533         ret = kvm_xen_handle_exit(cpu, &run->xen);
5534         break;
5535 #endif
5536     case KVM_EXIT_HYPERCALL:
5537         ret = kvm_handle_hypercall(run);
5538         break;
5539     default:
5540         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
5541         ret = -1;
5542         break;
5543     }
5544 
5545     return ret;
5546 }
5547 
5548 bool kvm_arch_stop_on_emulation_error(CPUState *cs)
5549 {
5550     X86CPU *cpu = X86_CPU(cs);
5551     CPUX86State *env = &cpu->env;
5552 
5553     kvm_cpu_synchronize_state(cs);
5554     return !(env->cr[0] & CR0_PE_MASK) ||
5555            ((env->segs[R_CS].selector  & 3) != 3);
5556 }
5557 
5558 void kvm_arch_init_irq_routing(KVMState *s)
5559 {
5560     /* We know at this point that we're using the in-kernel
5561      * irqchip, so we can use irqfds, and on x86 we know
5562      * we can use msi via irqfd and GSI routing.
5563      */
5564     kvm_msi_via_irqfd_allowed = true;
5565     kvm_gsi_routing_allowed = true;
5566 
5567     if (kvm_irqchip_is_split()) {
5568         KVMRouteChange c = kvm_irqchip_begin_route_changes(s);
5569         int i;
5570 
5571         /* If the ioapic is in QEMU and the lapics are in KVM, reserve
5572            MSI routes for signaling interrupts to the local apics. */
5573         for (i = 0; i < IOAPIC_NUM_PINS; i++) {
5574             if (kvm_irqchip_add_msi_route(&c, 0, NULL) < 0) {
5575                 error_report("Could not enable split IRQ mode.");
5576                 exit(1);
5577             }
5578         }
5579         kvm_irqchip_commit_route_changes(&c);
5580     }
5581 }
5582 
5583 int kvm_arch_irqchip_create(KVMState *s)
5584 {
5585     int ret;
5586     if (kvm_kernel_irqchip_split()) {
5587         ret = kvm_vm_enable_cap(s, KVM_CAP_SPLIT_IRQCHIP, 0, 24);
5588         if (ret) {
5589             error_report("Could not enable split irqchip mode: %s",
5590                          strerror(-ret));
5591             exit(1);
5592         } else {
5593             DPRINTF("Enabled KVM_CAP_SPLIT_IRQCHIP\n");
5594             kvm_split_irqchip = true;
5595             return 1;
5596         }
5597     } else {
5598         return 0;
5599     }
5600 }
5601 
5602 uint64_t kvm_swizzle_msi_ext_dest_id(uint64_t address)
5603 {
5604     CPUX86State *env;
5605     uint64_t ext_id;
5606 
5607     if (!first_cpu) {
5608         return address;
5609     }
5610     env = &X86_CPU(first_cpu)->env;
5611     if (!(env->features[FEAT_KVM] & (1 << KVM_FEATURE_MSI_EXT_DEST_ID))) {
5612         return address;
5613     }
5614 
5615     /*
5616      * If the remappable format bit is set, or the upper bits are
5617      * already set in address_hi, or the low extended bits aren't
5618      * there anyway, do nothing.
5619      */
5620     ext_id = address & (0xff << MSI_ADDR_DEST_IDX_SHIFT);
5621     if (!ext_id || (ext_id & (1 << MSI_ADDR_DEST_IDX_SHIFT)) || (address >> 32)) {
5622         return address;
5623     }
5624 
5625     address &= ~ext_id;
5626     address |= ext_id << 35;
5627     return address;
5628 }
5629 
5630 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
5631                              uint64_t address, uint32_t data, PCIDevice *dev)
5632 {
5633     X86IOMMUState *iommu = x86_iommu_get_default();
5634 
5635     if (iommu) {
5636         X86IOMMUClass *class = X86_IOMMU_DEVICE_GET_CLASS(iommu);
5637 
5638         if (class->int_remap) {
5639             int ret;
5640             MSIMessage src, dst;
5641 
5642             src.address = route->u.msi.address_hi;
5643             src.address <<= VTD_MSI_ADDR_HI_SHIFT;
5644             src.address |= route->u.msi.address_lo;
5645             src.data = route->u.msi.data;
5646 
5647             ret = class->int_remap(iommu, &src, &dst, dev ?     \
5648                                    pci_requester_id(dev) :      \
5649                                    X86_IOMMU_SID_INVALID);
5650             if (ret) {
5651                 trace_kvm_x86_fixup_msi_error(route->gsi);
5652                 return 1;
5653             }
5654 
5655             /*
5656              * Handled untranslated compatibility format interrupt with
5657              * extended destination ID in the low bits 11-5. */
5658             dst.address = kvm_swizzle_msi_ext_dest_id(dst.address);
5659 
5660             route->u.msi.address_hi = dst.address >> VTD_MSI_ADDR_HI_SHIFT;
5661             route->u.msi.address_lo = dst.address & VTD_MSI_ADDR_LO_MASK;
5662             route->u.msi.data = dst.data;
5663             return 0;
5664         }
5665     }
5666 
5667 #ifdef CONFIG_XEN_EMU
5668     if (xen_mode == XEN_EMULATE) {
5669         int handled = xen_evtchn_translate_pirq_msi(route, address, data);
5670 
5671         /*
5672          * If it was a PIRQ and successfully routed (handled == 0) or it was
5673          * an error (handled < 0), return. If it wasn't a PIRQ, keep going.
5674          */
5675         if (handled <= 0) {
5676             return handled;
5677         }
5678     }
5679 #endif
5680 
5681     address = kvm_swizzle_msi_ext_dest_id(address);
5682     route->u.msi.address_hi = address >> VTD_MSI_ADDR_HI_SHIFT;
5683     route->u.msi.address_lo = address & VTD_MSI_ADDR_LO_MASK;
5684     return 0;
5685 }
5686 
5687 typedef struct MSIRouteEntry MSIRouteEntry;
5688 
5689 struct MSIRouteEntry {
5690     PCIDevice *dev;             /* Device pointer */
5691     int vector;                 /* MSI/MSIX vector index */
5692     int virq;                   /* Virtual IRQ index */
5693     QLIST_ENTRY(MSIRouteEntry) list;
5694 };
5695 
5696 /* List of used GSI routes */
5697 static QLIST_HEAD(, MSIRouteEntry) msi_route_list = \
5698     QLIST_HEAD_INITIALIZER(msi_route_list);
5699 
5700 void kvm_update_msi_routes_all(void *private, bool global,
5701                                uint32_t index, uint32_t mask)
5702 {
5703     int cnt = 0, vector;
5704     MSIRouteEntry *entry;
5705     MSIMessage msg;
5706     PCIDevice *dev;
5707 
5708     /* TODO: explicit route update */
5709     QLIST_FOREACH(entry, &msi_route_list, list) {
5710         cnt++;
5711         vector = entry->vector;
5712         dev = entry->dev;
5713         if (msix_enabled(dev) && !msix_is_masked(dev, vector)) {
5714             msg = msix_get_message(dev, vector);
5715         } else if (msi_enabled(dev) && !msi_is_masked(dev, vector)) {
5716             msg = msi_get_message(dev, vector);
5717         } else {
5718             /*
5719              * Either MSI/MSIX is disabled for the device, or the
5720              * specific message was masked out.  Skip this one.
5721              */
5722             continue;
5723         }
5724         kvm_irqchip_update_msi_route(kvm_state, entry->virq, msg, dev);
5725     }
5726     kvm_irqchip_commit_routes(kvm_state);
5727     trace_kvm_x86_update_msi_routes(cnt);
5728 }
5729 
5730 int kvm_arch_add_msi_route_post(struct kvm_irq_routing_entry *route,
5731                                 int vector, PCIDevice *dev)
5732 {
5733     static bool notify_list_inited = false;
5734     MSIRouteEntry *entry;
5735 
5736     if (!dev) {
5737         /* These are (possibly) IOAPIC routes only used for split
5738          * kernel irqchip mode, while what we are housekeeping are
5739          * PCI devices only. */
5740         return 0;
5741     }
5742 
5743     entry = g_new0(MSIRouteEntry, 1);
5744     entry->dev = dev;
5745     entry->vector = vector;
5746     entry->virq = route->gsi;
5747     QLIST_INSERT_HEAD(&msi_route_list, entry, list);
5748 
5749     trace_kvm_x86_add_msi_route(route->gsi);
5750 
5751     if (!notify_list_inited) {
5752         /* For the first time we do add route, add ourselves into
5753          * IOMMU's IEC notify list if needed. */
5754         X86IOMMUState *iommu = x86_iommu_get_default();
5755         if (iommu) {
5756             x86_iommu_iec_register_notifier(iommu,
5757                                             kvm_update_msi_routes_all,
5758                                             NULL);
5759         }
5760         notify_list_inited = true;
5761     }
5762     return 0;
5763 }
5764 
5765 int kvm_arch_release_virq_post(int virq)
5766 {
5767     MSIRouteEntry *entry, *next;
5768     QLIST_FOREACH_SAFE(entry, &msi_route_list, list, next) {
5769         if (entry->virq == virq) {
5770             trace_kvm_x86_remove_msi_route(virq);
5771             QLIST_REMOVE(entry, list);
5772             g_free(entry);
5773             break;
5774         }
5775     }
5776     return 0;
5777 }
5778 
/*
 * Not implemented for x86: calling this aborts unconditionally, so no
 * value is ever returned and @data is unused.
 */
int kvm_arch_msi_data_to_gsi(uint32_t data)
{
    abort();
}
5783 
/*
 * Report WAITPKG availability; this is simply the value of the
 * has_msr_umwait flag, which is set elsewhere in this file.
 */
bool kvm_has_waitpkg(void)
{
    return has_msr_umwait;
}
5788 
5789 #define ARCH_REQ_XCOMP_GUEST_PERM       0x1025
5790 
5791 void kvm_request_xsave_components(X86CPU *cpu, uint64_t mask)
5792 {
5793     KVMState *s = kvm_state;
5794     uint64_t supported;
5795 
5796     mask &= XSTATE_DYNAMIC_MASK;
5797     if (!mask) {
5798         return;
5799     }
5800     /*
5801      * Just ignore bits that are not in CPUID[EAX=0xD,ECX=0].
5802      * ARCH_REQ_XCOMP_GUEST_PERM would fail, and QEMU has warned
5803      * about them already because they are not supported features.
5804      */
5805     supported = kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EAX);
5806     supported |= (uint64_t)kvm_arch_get_supported_cpuid(s, 0xd, 0, R_EDX) << 32;
5807     mask &= supported;
5808 
5809     while (mask) {
5810         int bit = ctz64(mask);
5811         int rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_GUEST_PERM, bit);
5812         if (rc) {
5813             /*
5814              * Older kernel version (<5.17) do not support
5815              * ARCH_REQ_XCOMP_GUEST_PERM, but also do not return
5816              * any dynamic feature from kvm_arch_get_supported_cpuid.
5817              */
5818             warn_report("prctl(ARCH_REQ_XCOMP_GUEST_PERM) failure "
5819                         "for feature bit %d", bit);
5820         }
5821         mask &= ~BIT_ULL(bit);
5822     }
5823 }
5824 
5825 static int kvm_arch_get_notify_vmexit(Object *obj, Error **errp)
5826 {
5827     KVMState *s = KVM_STATE(obj);
5828     return s->notify_vmexit;
5829 }
5830 
5831 static void kvm_arch_set_notify_vmexit(Object *obj, int value, Error **errp)
5832 {
5833     KVMState *s = KVM_STATE(obj);
5834 
5835     if (s->fd != -1) {
5836         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
5837         return;
5838     }
5839 
5840     s->notify_vmexit = value;
5841 }
5842 
5843 static void kvm_arch_get_notify_window(Object *obj, Visitor *v,
5844                                        const char *name, void *opaque,
5845                                        Error **errp)
5846 {
5847     KVMState *s = KVM_STATE(obj);
5848     uint32_t value = s->notify_window;
5849 
5850     visit_type_uint32(v, name, &value, errp);
5851 }
5852 
5853 static void kvm_arch_set_notify_window(Object *obj, Visitor *v,
5854                                        const char *name, void *opaque,
5855                                        Error **errp)
5856 {
5857     KVMState *s = KVM_STATE(obj);
5858     uint32_t value;
5859 
5860     if (s->fd != -1) {
5861         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
5862         return;
5863     }
5864 
5865     if (!visit_type_uint32(v, name, &value, errp)) {
5866         return;
5867     }
5868 
5869     s->notify_window = value;
5870 }
5871 
5872 static void kvm_arch_get_xen_version(Object *obj, Visitor *v,
5873                                      const char *name, void *opaque,
5874                                      Error **errp)
5875 {
5876     KVMState *s = KVM_STATE(obj);
5877     uint32_t value = s->xen_version;
5878 
5879     visit_type_uint32(v, name, &value, errp);
5880 }
5881 
5882 static void kvm_arch_set_xen_version(Object *obj, Visitor *v,
5883                                      const char *name, void *opaque,
5884                                      Error **errp)
5885 {
5886     KVMState *s = KVM_STATE(obj);
5887     Error *error = NULL;
5888     uint32_t value;
5889 
5890     visit_type_uint32(v, name, &value, &error);
5891     if (error) {
5892         error_propagate(errp, error);
5893         return;
5894     }
5895 
5896     s->xen_version = value;
5897     if (value && xen_mode == XEN_DISABLED) {
5898         xen_mode = XEN_EMULATE;
5899     }
5900 }
5901 
5902 static void kvm_arch_get_xen_gnttab_max_frames(Object *obj, Visitor *v,
5903                                                const char *name, void *opaque,
5904                                                Error **errp)
5905 {
5906     KVMState *s = KVM_STATE(obj);
5907     uint16_t value = s->xen_gnttab_max_frames;
5908 
5909     visit_type_uint16(v, name, &value, errp);
5910 }
5911 
5912 static void kvm_arch_set_xen_gnttab_max_frames(Object *obj, Visitor *v,
5913                                                const char *name, void *opaque,
5914                                                Error **errp)
5915 {
5916     KVMState *s = KVM_STATE(obj);
5917     Error *error = NULL;
5918     uint16_t value;
5919 
5920     visit_type_uint16(v, name, &value, &error);
5921     if (error) {
5922         error_propagate(errp, error);
5923         return;
5924     }
5925 
5926     s->xen_gnttab_max_frames = value;
5927 }
5928 
5929 static void kvm_arch_get_xen_evtchn_max_pirq(Object *obj, Visitor *v,
5930                                              const char *name, void *opaque,
5931                                              Error **errp)
5932 {
5933     KVMState *s = KVM_STATE(obj);
5934     uint16_t value = s->xen_evtchn_max_pirq;
5935 
5936     visit_type_uint16(v, name, &value, errp);
5937 }
5938 
5939 static void kvm_arch_set_xen_evtchn_max_pirq(Object *obj, Visitor *v,
5940                                              const char *name, void *opaque,
5941                                              Error **errp)
5942 {
5943     KVMState *s = KVM_STATE(obj);
5944     Error *error = NULL;
5945     uint16_t value;
5946 
5947     visit_type_uint16(v, name, &value, &error);
5948     if (error) {
5949         error_propagate(errp, error);
5950         return;
5951     }
5952 
5953     s->xen_evtchn_max_pirq = value;
5954 }
5955 
/*
 * Register the x86-specific properties on the KVM accelerator class @oc:
 * "notify-vmexit" (enum), "notify-window" (uint32), "xen-version"
 * (uint32), "xen-gnttab-max-frames" (uint16) and "xen-evtchn-max-pirq"
 * (uint16).  Each property gets its getter/setter pair from this file and
 * a human-readable description.
 */
void kvm_arch_accel_class_init(ObjectClass *oc)
{
    /* Notify VM exit policy, backed by the NotifyVmexitOption lookup. */
    object_class_property_add_enum(oc, "notify-vmexit", "NotifyVMexitOption",
                                   &NotifyVmexitOption_lookup,
                                   kvm_arch_get_notify_vmexit,
                                   kvm_arch_set_notify_vmexit);
    object_class_property_set_description(oc, "notify-vmexit",
                                          "Enable notify VM exit");

    /* Notify window length. */
    object_class_property_add(oc, "notify-window", "uint32",
                              kvm_arch_get_notify_window,
                              kvm_arch_set_notify_window,
                              NULL, NULL);
    object_class_property_set_description(oc, "notify-window",
                                          "Clock cycles without an event window "
                                          "after which a notification VM exit occurs");

    /* Xen emulation knobs. */
    object_class_property_add(oc, "xen-version", "uint32",
                              kvm_arch_get_xen_version,
                              kvm_arch_set_xen_version,
                              NULL, NULL);
    object_class_property_set_description(oc, "xen-version",
                                          "Xen version to be emulated "
                                          "(in XENVER_version form "
                                          "e.g. 0x4000a for 4.10)");

    object_class_property_add(oc, "xen-gnttab-max-frames", "uint16",
                              kvm_arch_get_xen_gnttab_max_frames,
                              kvm_arch_set_xen_gnttab_max_frames,
                              NULL, NULL);
    object_class_property_set_description(oc, "xen-gnttab-max-frames",
                                          "Maximum number of grant table frames");

    object_class_property_add(oc, "xen-evtchn-max-pirq", "uint16",
                              kvm_arch_get_xen_evtchn_max_pirq,
                              kvm_arch_set_xen_evtchn_max_pirq,
                              NULL, NULL);
    object_class_property_set_description(oc, "xen-evtchn-max-pirq",
                                          "Maximum number of Xen PIRQs");
}
5996 
/*
 * Pass @max_apic_id to KVM by enabling KVM_CAP_MAX_VCPU_ID on the global
 * kvm_state.  The result of kvm_vm_enable_cap() is not checked here.
 * NOTE(review): presumably deliberate best-effort -- confirm whether a
 * failure should be reported.
 */
void kvm_set_max_apic_id(uint32_t max_apic_id)
{
    kvm_vm_enable_cap(kvm_state, KVM_CAP_MAX_VCPU_ID, 0, max_apic_id);
}
6001